create a single index #5

Merged 8 commits on Aug 16, 2024
6 changes: 4 additions & 2 deletions Project.toml
@@ -14,6 +14,7 @@ Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
@@ -30,8 +31,10 @@ Gumbo = "0.8"
HDF5 = "0.17"
HTTP = "1.10"
Inflate = "0.1"
JSON = "0.21"
LinearAlgebra = "1"
PromptingTools = "0.48"
PromptingTools = "0.49"
Random = "1"
SHA = "0.7"
Serialization = "1"
SparseArrays = "1"
@@ -40,7 +43,6 @@ Test = "1"
URIs = "1.5"
Unicode = "1"
julia = "1.10"
JSON = "0.21"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
1 change: 1 addition & 0 deletions src/DocsScraper.jl
@@ -13,6 +13,7 @@ using SHA
using Serialization, URIs
using Dates
using JSON
using Random

include("parser.jl")
include("crawl.jl")
142 changes: 85 additions & 57 deletions src/make_knowledge_packs.jl
@@ -38,7 +38,6 @@ Parse URLs from hostname_url_dict and save the chunks
function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}},
knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
min_chunk_size::Int = MIN_CHUNK_SIZE)
SAVE_CHUNKS = true
for (hostname, urls) in hostname_url_dict
output_chunks = Vector{SubString{String}}()
output_sources = Vector{String}()
@@ -52,16 +51,14 @@ function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractStri
@error "error!! check url: $url"
end
end
if SAVE_CHUNKS
serialize(
joinpath(knowledge_pack_path,
"$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
output_chunks)
serialize(
joinpath(knowledge_pack_path,
"$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
output_sources)
end
serialize(
joinpath(knowledge_pack_path,
"$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
output_chunks)
serialize(
joinpath(knowledge_pack_path,
"$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
output_sources)
end
end
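
For reference, a minimal sketch of reading the serialized chunk and source files back with Serialization.jl. The pack path, hostname, and chunk sizes below are hypothetical; the file names follow the naming scheme used in `make_chunks`:

using Serialization

# Hypothetical values; file names follow "$(hostname)-chunks-max-...-min-....jls".
knowledge_pack_path = "knowledge_packs"
hostname, max_sz, min_sz = "docs.example.com", 512, 40

chunks = deserialize(joinpath(knowledge_pack_path,
    "$(hostname)-chunks-max-$(max_sz)-min-$(min_sz).jls"))
sources = deserialize(joinpath(knowledge_pack_path,
    "$(hostname)-sources-max-$(max_sz)-min-$(min_sz).jls"))
@assert length(chunks) == length(sources)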

@@ -87,19 +84,25 @@ end

"""
generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL,
embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString)
embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString,
bool_embeddings::Bool = true, index_name::AbstractString = "")

Deserialize chunks and sources to generate embeddings.
Note: We highly recommend passing `index_name`; it becomes the name of the generated index. Default: date-randomInt


# Arguments
- model: Embedding model
- embedding_size: Embedding dimensions
- custom_metadata: Custom metadata like ecosystem name if required
- bool_embeddings: If true, the generated embeddings are boolean; otherwise Float32
- index_name: Name of the index. Default: date-randomInt
"""
function generate_embeddings(
knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
model::AbstractString = MODEL,
embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString)
embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString,
bool_embeddings::Bool = true, index_name::AbstractString = "")
embedder = RT.BatchEmbedder()
entries = readdir(knowledge_pack_path)
# Initialize a dictionary to group files by hostname and chunk size
@@ -109,9 +112,6 @@ function generate_embeddings(
chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$"
sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$"

# chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$"
# sources_pattern = r"^(.*)-sources-(\d+)\.jls$"

# Group files by hostname and chunk size
for file in entries
match_chunks = match(chunks_pattern, file)
@@ -141,62 +141,85 @@ function generate_embeddings(
knowledge_pack_path, file)
end
end
# Process each pair of files

chunks = Vector{SubString{String}}()
sources = Vector{String}()

# Add chunks and sources to vectors from each scraped file

for (hostname, chunk_files) in hostname_files
for (max_chunk_size, files) in chunk_files
if haskey(files, "chunks") && haskey(files, "sources")
chunks_file = files["chunks"]
sources_file = files["sources"]
chunks = deserialize(chunks_file)
sources = deserialize(sources_file)
cost_tracker = Threads.Atomic{Float64}(0.0)
full_embeddings = RT.get_embeddings(
embedder, chunks; model, verbose = false, cost_tracker)
@info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))"

trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
fn_output = joinpath(knowledge_pack_path, "packs",
"$hostname-$model-$trunc-Float32__v1.0.tar.gz")
fn_temp = joinpath(knowledge_pack_path, "packs",
"$hostname-$model-$trunc-Float32__v1.0.hdf5")

h5open(fn_temp, "w") do file
file["chunks"] = chunks
file["sources"] = sources
file["embeddings"] = full_embeddings[1:embedding_size, :] |>
l2_norm_columns |> x -> map(>(0), x)
file["type"] = "ChunkIndex"

package_url_dict = Dict{String, Vector{String}}()
package_url_dict = urls_for_metadata(sources)

metadata = Dict(
:embedded_dt => Dates.today(),
:custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size,
:embedding_size => embedding_size, :model => model,
:packages => package_url_dict)

metadata_json = JSON.json(metadata)
file["metadata"] = metadata_json
end

command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
run(command)
report_artifact(fn_output)
append!(chunks, deserialize(chunks_file))
append!(sources, deserialize(sources_file))

else
@warn "Missing pair for hostname: $hostname, max chunk size: $max_chunk_size"
end
end
end

# Generate embeddings
cost_tracker = Threads.Atomic{Float64}(0.0)
full_embeddings = RT.get_embeddings(
embedder, chunks; model, verbose = false, cost_tracker)

full_embeddings = full_embeddings[1:embedding_size, :] |>
l2_norm_columns

if bool_embeddings
full_embeddings = map(>(0), full_embeddings)
end

if isempty(index_name)
rand_int = rand(1000:100000)
date = Dates.today()
index_name = "$(date)-$(rand_int)"
end

@info "Created embeddings for $index_name. Cost: \$$(round(cost_tracker[], digits=3))"

trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
emb_data_type = bool_embeddings ? "Bool" : "Float32"

fn_output = joinpath(knowledge_pack_path, "packs",
"$index_name-$model-$trunc-$(emb_data_type)__v1.0.tar.gz")
fn_temp = joinpath(knowledge_pack_path, "packs",
"$index_name-$model-$trunc-$(emb_data_type)__v1.0.hdf5")

h5open(fn_temp, "w") do file
file["chunks"] = chunks
file["sources"] = sources
file["embeddings"] = full_embeddings
file["type"] = "ChunkIndex"

package_url_dict = urls_for_metadata(sources)

metadata = Dict(
:embedded_dt => Dates.today(),
:custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size,
:embedding_size => embedding_size, :model => model,
:packages => package_url_dict)

metadata_json = JSON.json(metadata)
file["metadata"] = metadata_json
end

command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
run(command)
report_artifact(fn_output)
end
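
To illustrate the resulting artifact, here is a sketch of opening the temporary HDF5 file the block above writes before tarring. The file name is hypothetical but follows the `$index_name-$model-$trunc-$(emb_data_type)__v1.0.hdf5` scheme:

using HDF5, JSON

# Hypothetical pack file; adjust to the actual index name and embedding model.
fn_temp = joinpath("knowledge_packs", "packs",
    "2024-08-16-4242-text-embedding-3-large-1-Bool__v1.0.hdf5")

h5open(fn_temp, "r") do file
    chunks = read(file["chunks"])
    embeddings = read(file["embeddings"])   # embedding_size × n_chunks
    metadata = JSON.parse(read(file["metadata"]))
    @info "Pack contents" n_chunks=length(chunks) emb_size=size(embeddings, 1) model=metadata["model"]
end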

"""
make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],
max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE,
custom_metadata::AbstractString)
custom_metadata::AbstractString, bool_embeddings::Bool = true, index_name::AbstractString = "")

Entry point to crawl, parse and generate embeddings
Entry point to crawl, parse, and generate embeddings.
Note: We highly recommend passing `index_name`; it becomes the name of the generated index. Default: date-randomInt

# Arguments
- crawlable_urls: URLs that should be crawled to find more links
@@ -206,11 +229,14 @@ Entry point to crawl, parse and generate embeddings
- model: Embedding model
- embedding_size: Embedding dimensions
- custom_metadata: Custom metadata like ecosystem name if required
- bool_embeddings: If true, the generated embeddings are boolean; otherwise Float32
- index_name: Name of the index. Default: date-randomInt
"""
function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[];
single_urls::Vector{<:AbstractString} = String[],
max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE,
model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "")
model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "",
bool_embeddings::Bool = true, index_name::AbstractString = "")
if isempty(crawlable_urls) && isempty(single_urls)
error("At least one of `input_urls` or `single_pages` must be provided.")
end
@@ -241,5 +267,7 @@ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[
make_chunks(
hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size)
generate_embeddings(
knowledge_pack_path; max_chunk_size, model, embedding_size, custom_metadata)
knowledge_pack_path; max_chunk_size, model, embedding_size,
custom_metadata, bool_embeddings, index_name)

end
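
A minimal end-to-end sketch of the updated entry point with the new keyword arguments; the URL is a placeholder, and the remaining defaults (`MODEL`, `EMBEDDING_SIZE`, chunk sizes) come from the package constants:

using DocsScraper

# Placeholder docs site; pass any crawlable documentation URLs.
make_knowledge_packs(["https://docs.example.com/stable/"];
    index_name = "examplepack",    # recommended; otherwise defaults to date-randomInt
    bool_embeddings = true,        # Bool embeddings; set false for Float32
    custom_metadata = "example ecosystem")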