Adding index + query examples for Julia documentation, and some evals. #35

Merged · 6 commits · Sep 27, 2024
9 changes: 7 additions & 2 deletions Project.toml
@@ -4,8 +4,6 @@ authors = ["Siddhant Chaudhary <[email protected]> and contributors"]
version = "0.1.0"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
@@ -19,4 +17,11 @@ TextEncodeBase = "f92c20c0-9f2a-4705-8116-881385faba05"
Transformers = "21ca0261-441d-5938-ace7-c90938fde4d4"

[compat]
Flux = "0.14"
JLD2 = "0.4"
JSON = "0.21"
NeuralAttentionlib = "0.3"
StatsBase = "0.34"
TextEncodeBase = "0.8"
Transformers = "0.3"
julia = "1.10"
191 changes: 86 additions & 105 deletions README.md

Large diffs are not rendered by default.

14 changes: 8 additions & 6 deletions docs/make.jl
@@ -6,18 +6,20 @@ DocMeta.setdocmeta!(ColBERT, :DocTestSetup, :(using ColBERT); recursive = true)
makedocs(;
    modules = [ColBERT],
    authors = "Siddhant Chaudhary <[email protected]> and contributors",
    sitename = "ColBERT.jl",
    sitename = "ColBERT",
    format = Documenter.HTML(;
        canonical = "https://codetalker7.github.io/ColBERT.jl",
        edit_link = "main",
        assets = String[]
        assets = String[],
        sidebar_sitename = false
    ),
    pages = [
        "Home" => "index.md"
        "Home" => "index.md",
        "Reference" => "api.md"
    ]
)

deploydocs(;
    repo = "github.com/codetalker7/ColBERT.jl",
    devbranch = "main"
    target = "build",
    devbranch = "main",
    push_preview = true
)
6 changes: 6 additions & 0 deletions docs/src/api.md
@@ -0,0 +1,6 @@
```@index
```

```@autodocs
Modules = [ColBERT]
```
206 changes: 201 additions & 5 deletions docs/src/index.md
@@ -2,13 +2,209 @@
CurrentModule = ColBERT
```

# ColBERT
# ColBERT: Efficient, late-interaction retrieval systems in Julia!

Documentation for [ColBERT](https://github.com/codetalker7/ColBERT.jl).
[ColBERT.jl](https://github.com/codetalker7/ColBERT.jl) is a pure Julia package for the ColBERT
information retrieval system, allowing developers to integrate this powerful neural
retrieval algorithm into their own downstream tasks. ColBERT (contextualized late
interaction over BERT) has emerged as a state-of-the-art approach for efficient and
effective document retrieval, thanks to its ability to leverage contextualized
embeddings from pre-trained language models like BERT.

Inspired by the [original Python implementation of ColBERT](https://github.com/stanford-futuredata/ColBERT),
ColBERT.jl lets you bring this capability to your Julia
applications, whether you're working on natural language processing
tasks, information retrieval systems, or other areas where retrieving relevant
documents is crucial. The package provides a simple and intuitive interface for
using ColBERT in Julia, making it easy to get started with this powerful algorithm.

# Get Started

To install the package, simply clone the repository and `dev` it:

```julia-repl
julia> ] dev .
```
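
Equivalently, here's a minimal sketch using the `Pkg` API directly (assuming you've cloned the repository and started Julia in its root directory):

```julia-repl
julia> using Pkg

julia> Pkg.develop(path = ".")  # same as `] dev .` from the Pkg REPL
```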

Consult the [README](https://github.com/JuliaGenAI/ColBERT.jl) of the GitHub repository
for a small example. In this guide, we'll index a collection of 1000 documents.

## Dataset and preprocessing

We'll go through an example of the `lifestyle/dev` split of the
[LoTTe](https://github.com/stanford-futuredata/colbert/blob/main/lotte.md) dataset.
To download the dataset, you can use the `examples/lotte.sh` script. We'll work
with the first 1000 documents of the dataset:

```shell
$ cd examples
$ ./lotte.sh
$ head -n 1000 downloads/lotte/lifestyle/dev/collection.tsv > 1kcollection.tsv
$ wc -l 1kcollection.tsv
1000 1kcollection.tsv
```

The `1kcollection.tsv` file has documents in the format `pid \t <document text>`, where
`pid` is the unique ID of the document. For now, the package only supports collections
which have one document per line. So, we'll simply remove the `pid` from each document
in `1kcollection.tsv`, and save the resultant file of documents in `1kcollection.txt`.
Here's a simple Julia script you can use to do this preprocessing using the
[CSV.jl](https://github.com/JuliaData/CSV.jl) package:

```julia
using CSV

# read the (pid, text) pairs from the TSV file
file = CSV.File("1kcollection.tsv"; delim = '\t', header = [:pid, :text],
    types = Dict(:pid => Int, :text => String), debug = true, quoted = false)

# write out only the document text, one document per line
open("1kcollection.txt", "w") do io
    for doc in file.text
        write(io, doc, '\n')
    end
end
```
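
If you'd prefer not to pull in CSV.jl just for this step, here's a minimal Base-Julia sketch of the same preprocessing (assuming each line has at least one tab separating the `pid` from the document text):

```julia
# drop the pid column and keep only the document text, one document per line
open("1kcollection.txt", "w") do out
    for line in eachline("1kcollection.tsv")
        _, text = split(line, '\t'; limit = 2)
        write(out, text, '\n')
    end
end
```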

We now have our collection of documents to index!

## The `ColBERTConfig`

To start off, make sure you've downloaded a ColBERT checkpoint somewhere on your system;
for this example, I'll download the `colbert-ir/colbertv2.0` checkpoint to `$HOME/models`:

```shell
git lfs install
cd $HOME/models
git clone https://huggingface.co/colbert-ir/colbertv2.0
```
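
Before moving on, you can quickly verify from Julia that the checkpoint directory is in place (a trivial sanity check; the path below just mirrors the `$HOME/models` location used above):

```julia-repl
julia> isdir(joinpath(homedir(), "models", "colbertv2.0"))
true
```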

The next step is to create a configuration object containing details about all parameters used
during indexing/searching using ColBERT. All this information is contained in a type called
`ColBERTConfig`. Creating a `ColBERTConfig` is easy; it has the right defaults for most users,
and one can change the settings using simple kwargs. In this example, we'll create a config
for the collection `1kcollection.txt` we just created, and we'll also use
[CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) for GPU support (you can use
any GPU backend supported by [Flux.jl](https://github.com/FluxML/Flux.jl))!

```julia-repl
julia> using ColBERT, CUDA, Random;

julia> Random.seed!(0) # global seed for a reproducible index

julia> config = ColBERTConfig(
           use_gpu = true,
           checkpoint = "/home/codetalker7/models/colbertv2.0",  # local path to the colbert checkpoint
           collection = "./1kcollection.txt",                    # local path to the collection
           doc_maxlen = 300,                                     # max length beyond which docs are truncated
           index_path = "./1kcollection_index/",                 # local directory to save the index in
           chunksize = 200                                       # number of docs to store in a chunk
       );
```

You can read more about a `ColBERTConfig` from its docstring.
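
For instance, you can pull up the docstring from the REPL's built-in help mode:

```julia-repl
julia> using ColBERT

help?> ColBERTConfig
```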

## Building the index

Building the index is even easier than creating a config; just build an `Indexer` and call the
`index` function. I used an NVIDIA GeForce RTX 2080 Ti card to build the index:

```julia-repl
julia> indexer = Indexer(config);

julia> @time index(indexer)
[ Info: # of sampled PIDs = 636
[ Info: Encoding 636 passages.
[ Info: avg_doclen_est = 233.25157232704402 length(local_sample) = 636
[ Info: Creating 4096 clusters.
[ Info: Estimated 233251.572327044 embeddings.
[ Info: Saving the index plan to ./1kcollection_index/plan.json.
[ Info: Saving the config to the indexing path.
[ Info: Training the clusters.
[ Info: Iteration 1/20, max delta: 0.26976448
[ Info: Iteration 2/20, max delta: 0.17742664
[ Info: Iteration 3/20, max delta: 0.16281573
[ Info: Iteration 4/20, max delta: 0.120501295
[ Info: Iteration 5/20, max delta: 0.08808214
[ Info: Iteration 6/20, max delta: 0.14226294
[ Info: Iteration 7/20, max delta: 0.07096822
[ Info: Iteration 8/20, max delta: 0.081315234
[ Info: Iteration 9/20, max delta: 0.06760075
[ Info: Iteration 10/20, max delta: 0.07043305
[ Info: Iteration 11/20, max delta: 0.060436506
[ Info: Iteration 12/20, max delta: 0.048092205
[ Info: Iteration 13/20, max delta: 0.052080974
[ Info: Iteration 14/20, max delta: 0.055756018
[ Info: Iteration 15/20, max delta: 0.057068985
[ Info: Iteration 16/20, max delta: 0.05717972
[ Info: Iteration 17/20, max delta: 0.02952642
[ Info: Iteration 18/20, max delta: 0.025388952
[ Info: Iteration 19/20, max delta: 0.034007154
[ Info: Iteration 20/20, max delta: 0.047712516
[ Info: Got bucket_cutoffs_quantiles = [0.25, 0.5, 0.75] and bucket_weights_quantiles = [0.125, 0.375, 0.625, 0.875]
[ Info: Got bucket_cutoffs = Float32[-0.023658333, -9.9312514f-5, 0.023450013] and bucket_weights = Float32[-0.044035435, -0.010775891, 0.010555617, 0.043713447]
[ Info: avg_residual = 0.031616904
[ Info: Saving codec to ./1kcollection_index/centroids.jld2, ./1kcollection_index/avg_residual.jld2, ./1kcollection_index/bucket_cutoffs.jld2 and ./1kcollection_index/bucket_weights.jld2.
[ Info: Building the index.
[ Info: Loading codec from ./1kcollection_index/centroids.jld2, ./1kcollection_index/avg_residual.jld2, ./1kcollection_index/bucket_cutoffs.jld2 and ./1kcollection_index/bucket_weights.jld2.
[ Info: Encoding 200 passages.
[ Info: Saving chunk 1: 200 passages and 36218 embeddings. From passage #1 onward.
[ Info: Saving compressed codes to ./1kcollection_index/1.codes.jld2 and residuals to ./1kcollection_index/1.residuals.jld2
[ Info: Saving doclens to ./1kcollection_index/doclens.1.jld2
[ Info: Saving metadata to ./1kcollection_index/1.metadata.json
[ Info: Encoding 200 passages.
[ Info: Saving chunk 2: 200 passages and 45064 embeddings. From passage #201 onward.
[ Info: Saving compressed codes to ./1kcollection_index/2.codes.jld2 and residuals to ./1kcollection_index/2.residuals.jld2
[ Info: Saving doclens to ./1kcollection_index/doclens.2.jld2
[ Info: Saving metadata to ./1kcollection_index/2.metadata.json
[ Info: Encoding 200 passages.
[ Info: Saving chunk 3: 200 passages and 50956 embeddings. From passage #401 onward.
[ Info: Saving compressed codes to ./1kcollection_index/3.codes.jld2 and residuals to ./1kcollection_index/3.residuals.jld2
[ Info: Saving doclens to ./1kcollection_index/doclens.3.jld2
[ Info: Saving metadata to ./1kcollection_index/3.metadata.json
[ Info: Encoding 200 passages.
[ Info: Saving chunk 4: 200 passages and 49415 embeddings. From passage #601 onward.
[ Info: Saving compressed codes to ./1kcollection_index/4.codes.jld2 and residuals to ./1kcollection_index/4.residuals.jld2
[ Info: Saving doclens to ./1kcollection_index/doclens.4.jld2
[ Info: Saving metadata to ./1kcollection_index/4.metadata.json
[ Info: Encoding 200 passages.
[ Info: Saving chunk 5: 200 passages and 52304 embeddings. From passage #801 onward.
[ Info: Saving compressed codes to ./1kcollection_index/5.codes.jld2 and residuals to ./1kcollection_index/5.residuals.jld2
[ Info: Saving doclens to ./1kcollection_index/doclens.5.jld2
[ Info: Saving metadata to ./1kcollection_index/5.metadata.json
[ Info: Running some final checks.
[ Info: Checking if all files are saved.
[ Info: Found all files!
[ Info: Collecting embedding ID offsets.
[ Info: Saving the indexing metadata.
[ Info: Building the centroid to embedding IVF.
[ Info: Loading codes for each embedding.
[ Info: Sorting the codes.
[ Info: Getting unique codes and their counts.
[ Info: Saving the IVF.
151.833047 seconds (78.15 M allocations: 28.871 GiB, 41.12% gc time, 0.51% compilation time: <1% of which was recompilation)
```
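
Once indexing finishes, the index directory holds the plan, the codec and the per-chunk files mentioned in the logs above. A quick way to sanity-check the result is to list the directory (the file names are taken from the logs; the exact listing may vary between versions):

```julia-repl
julia> readdir("./1kcollection_index")  # expect plan.json, centroids.jld2, 1.codes.jld2, doclens.1.jld2, ...
```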

## Searching

Once you've built the index for your collection of documents, you can run queries against it.
Start by creating a `Searcher` from the path of the index:

```julia-repl
julia> using ColBERT, CUDA;

julia> searcher = Searcher("1kcollection_index");
```

Next, simply feed a query to the `search` function to get the top-`k` matching documents:

```julia-repl
julia> query = "what is 1080 fox bait poisoning?";

julia> @time pids, scores = search(searcher, query, 10) # second run statistics
0.136773 seconds (1.95 M allocations: 240.648 MiB, 0.00% compilation time)
([999, 383, 386, 323, 547, 385, 384, 344, 963, 833], Float32[8.754782, 7.6871076, 6.8440857, 6.365711, 6.323611, 6.1222105, 5.92911, 5.708316, 5.597268, 5.4987035])
```

You can now use these pids to see which documents best match your query:

```julia-repl
julia> print(readlines("1kcollection.txt")[pids[1]])
Tl;dr - Yes, it sounds like a possible 1080 fox bait poisoning. Can't be sure though. The traditional fox bait is called 1080. That poisonous bait is still used in a few countries to kill foxes, rabbits, possums and other mammal pests. The toxin in 1080 is Sodium fluoroacetate. Wikipedia is a bit vague on symptoms in animals, but for humans they say: In humans, the symptoms of poisoning normally appear between 30 minutes and three hours after exposure. Initial symptoms typically include nausea, vomiting and abdominal pain; sweating, confusion and agitation follow. In significant poisoning, cardiac abnormalities including tachycardia or bradycardia, hypotension and ECG changes develop. Neurological effects include muscle twitching and seizures... One might safely assume a dog, especially a small Whippet, would show symptoms of poisoning faster than the 30 mins stated for humans. The listed (human) symptoms look like a good fit to what your neighbour reported about your dog. Strychnine is another commonly used poison against mammal pests. It affects the animal's muscles so that contracted muscles can no longer relax. That means the muscles responsible of breathing cease to operate and the animal suffocates to death in less than two hours. This sounds like unlikely case with your dog. One possibility is unintentional pet poisoning by snail/slug baits. These baits are meant to control a population of snails and slugs in a garden. Because the pelletized bait looks a lot like dry food made for dogs it is easily one of the most common causes of unintentional poisoning of dogs. The toxin in these baits is Metaldehyde and a dog may die inside four hours of ingesting these baits, which sounds like too slow to explain what happened to your dog, even though the symptoms of this toxin are somewhat similar to your case. Then again, the malicious use of poisons against neighbourhood dogs can vary a lot. In fact they don't end with just pesticides but also other harmful matter, like medicine made for humans and even razorblades stuck inside a meatball, have been found in baits. It is quite impossible to say what might have caused the death of your dog, at least without autopsy and toxicology tests. The 1080 is just one of the possible explanations. It is best to always use a leash when walking dogs in populated areas and only let dogs free (when allowed by local legislation) in unpopulated parks and forests and suchlike places.
```
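
To inspect all of the top-`k` matches along with their scores, here's a small sketch (it assumes the `pids` and `scores` returned by `search` above, which are already sorted by descending score):

```julia-repl
julia> docs = readlines("1kcollection.txt");

julia> for (pid, score) in zip(pids, scores)
           println("$(score)\tpid $(pid)\t", first(docs[pid], 80), "...")
       end
```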
31 changes: 31 additions & 0 deletions examples/AIHelpMe/download_pack.sh
@@ -0,0 +1,31 @@
# the knowledge packs
wget https://media.githubusercontent.com/media/svilupp/AIHelpMeArtifacts/main/artifacts/genie__v20240818__textembedding3large-1024-Bool__v1.0.tar.gz?download=true \
-O genie_knowledge_pack.tar.gz
wget https://media.githubusercontent.com/media/svilupp/AIHelpMeArtifacts/main/artifacts/julia__v1.10.2__textembedding3large-1024-Bool__v1.0.tar.gz?download=true \
-O julia_v1.10.2_knowledge_pack.tar.gz
wget https://media.githubusercontent.com/media/svilupp/AIHelpMeArtifacts/main/artifacts/juliadata__v20240716__textembedding3large-1024-Bool__v1.0.tar.gz?download=true \
-O juliadata_knowledge_pack.tar.gz
wget https://media.githubusercontent.com/media/svilupp/AIHelpMeArtifacts/main/artifacts/julialang__v20240819__textembedding3large-1024-Bool__v1.0.tar.gz?download=true \
-O julialang_knowledge_pack.tar.gz
wget https://media.githubusercontent.com/media/svilupp/AIHelpMeArtifacts/main/artifacts/makie__v20240716__textembedding3large-1024-Bool__v1.0.tar.gz?download=true \
-O makie_knowledge_pack.tar.gz
wget https://media.githubusercontent.com/media/svilupp/AIHelpMeArtifacts/main/artifacts/plots__v20240716__textembedding3large-1024-Bool__v1.0.tar.gz?download=true \
-O plots_knowledge_pack.tar.gz
wget https://media.githubusercontent.com/media/svilupp/AIHelpMeArtifacts/main/artifacts/sciml__v20240716__textembedding3large-1024-Bool__v1.0.tar.gz?download=true \
-O sciml_knowledge_pack.tar.gz
wget https://media.githubusercontent.com/media/svilupp/AIHelpMeArtifacts/main/artifacts/tidier__v20240716__textembedding3large-1024-Bool__v1.0.tar.gz?download=true \
-O tidier_knowledge_pack.tar.gz

# unpack all the packs
tar -xvzf genie_knowledge_pack.tar.gz
tar -xvzf julia_v1.10.2_knowledge_pack.tar.gz
tar -xvzf juliadata_knowledge_pack.tar.gz
tar -xvzf julialang_knowledge_pack.tar.gz
tar -xvzf makie_knowledge_pack.tar.gz
tar -xvzf plots_knowledge_pack.tar.gz
tar -xvzf sciml_knowledge_pack.tar.gz
tar -xvzf tidier_knowledge_pack.tar.gz


# eval pack
wget https://raw.githubusercontent.com/svilupp/AIHelpMe.jl/main/evaluations/JuliaData/dataframe_combined_filtered-qa-evals.json -O qa_evals.json
43 changes: 43 additions & 0 deletions examples/AIHelpMe/evals.jl
@@ -0,0 +1,43 @@
# loading the docs
using HDF5

doc_passages = String[]
doc_sources = String[]

# read the chunk texts and their sources from each knowledge pack
for file in ["genie__v20240818__textembedding3large-1024-Bool__v1.0.hdf5",
    "JuliaData-text-embedding-3-large-1-Bool__v1.0.hdf5",
    "julialang__v20240819__textembedding3large-1024-Bool__v1.0.hdf5",
    "Makie-text-embedding-3-large-1-Bool__v1.0.hdf5",
    "pack.hdf5", "Plots-text-embedding-3-large-1-Bool__v1.0.hdf5",
    "sciml__v20240716__textembedding3large-1024-Bool__v1.0.hdf5",
    "tidier__v20240716__textembedding3large-1024-Bool__v1.0.hdf5"]
    # the do-block form ensures each HDF5 file is closed after reading
    h5open(file, "r") do fid
        append!(doc_passages, read(fid["chunks"]))
        append!(doc_sources, read(fid["sources"]))
    end
end

# evals
using ColBERT, CUDA, Random, JSON;
using PromptingTools: distance_longest_common_subsequence
# CUDA.devices()
# device!(5)
Random.seed!(0)

## load the evaluation qa
eval_qa = JSON.parsefile("qa_evals.json")

## get the searcher
searcher = Searcher("./juliadocsindex/");

## for each question, check whether its gold context is among the top-k retrieved passages
k = 5
num_hits = 0
for query in eval_qa
    @time pids, _ = search(searcher, query["question"], k)
    # count a hit if any retrieved passage is close enough to the gold context
    if minimum(distance_longest_common_subsequence(
        query["context"], doc_passages[pids])) < 0.33
        global num_hits += 1  # `global` is needed when updating a top-level variable inside a script loop
    end
end
print("Hit rate: ", num_hits / length(eval_qa))