create a single index #5

Merged 8 commits on Aug 16, 2024
6 changes: 4 additions & 2 deletions Project.toml
@@ -14,6 +14,7 @@ Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
@@ -30,8 +31,10 @@ Gumbo = "0.8"
HDF5 = "0.17"
HTTP = "1.10"
Inflate = "0.1"
JSON = "0.21"
LinearAlgebra = "1"
PromptingTools = "0.48"
PromptingTools = "0.49"
Random = "1"
SHA = "0.7"
Serialization = "1"
SparseArrays = "1"
@@ -40,7 +43,6 @@ Test = "1"
URIs = "1.5"
Unicode = "1"
julia = "1.10"
JSON = "0.21"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
1 change: 1 addition & 0 deletions src/DocsScraper.jl
@@ -13,6 +13,7 @@ using SHA
using Serialization, URIs
using Dates
using JSON
using Random

include("parser.jl")
include("crawl.jl")
142 changes: 85 additions & 57 deletions src/make_knowledge_packs.jl
@@ -38,7 +38,6 @@ Parse URLs from hostname_url_dict and save the chunks
function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}},
knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
min_chunk_size::Int = MIN_CHUNK_SIZE)
SAVE_CHUNKS = true
for (hostname, urls) in hostname_url_dict
output_chunks = Vector{SubString{String}}()
output_sources = Vector{String}()
@@ -52,16 +51,14 @@ function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractStri
@error "error!! check url: $url"
end
end
if SAVE_CHUNKS
serialize(
joinpath(knowledge_pack_path,
"$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
output_chunks)
serialize(
joinpath(knowledge_pack_path,
"$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
output_sources)
end
serialize(
joinpath(knowledge_pack_path,
"$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
output_chunks)
serialize(
joinpath(knowledge_pack_path,
"$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
output_sources)
end
end
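
For reference, a minimal sketch of reading the serialized chunk and source files back with Serialization.jl. The pack path, hostname, and chunk sizes below are hypothetical; the file names follow the naming scheme used in `make_chunks`:

using Serialization

# Hypothetical values; file names follow "$(hostname)-chunks-max-...-min-....jls".
knowledge_pack_path = "knowledge_packs"
hostname, max_sz, min_sz = "docs.example.com", 512, 40

chunks = deserialize(joinpath(knowledge_pack_path,
    "$(hostname)-chunks-max-$(max_sz)-min-$(min_sz).jls"))
sources = deserialize(joinpath(knowledge_pack_path,
    "$(hostname)-sources-max-$(max_sz)-min-$(min_sz).jls"))
@assert length(chunks) == length(sources)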

@@ -87,19 +84,25 @@ end

"""
generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL,
embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString)
embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString,
bool_embeddings::Bool = true, index_name::AbstractString = "")

Deserialize chunks and sources to generate embeddings.
Note: We highly recommend passing `index_name`; it becomes the name of the generated index. Default: date-randomInt


# Arguments
- model: Embedding model
- embedding_size: Embedding dimensions
- custom_metadata: Custom metadata like ecosystem name if required
- bool_embeddings: If true, the generated embeddings are boolean; otherwise Float32
- index_name: Name of the index. Default: date-randomInt
"""
function generate_embeddings(
knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
model::AbstractString = MODEL,
embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString)
embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString,
bool_embeddings::Bool = true, index_name::AbstractString = "")
embedder = RT.BatchEmbedder()
entries = readdir(knowledge_pack_path)
# Initialize a dictionary to group files by hostname and chunk size
@@ -109,9 +112,6 @@ function generate_embeddings(
chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$"
sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$"

# chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$"
# sources_pattern = r"^(.*)-sources-(\d+)\.jls$"

# Group files by hostname and chunk size
for file in entries
match_chunks = match(chunks_pattern, file)
@@ -141,62 +141,85 @@ function generate_embeddings(
knowledge_pack_path, file)
end
end
# Process each pair of files

chunks = Vector{SubString{String}}()
sources = Vector{String}()

# Add chunks and sources to vectors from each scraped file

for (hostname, chunk_files) in hostname_files
for (max_chunk_size, files) in chunk_files
if haskey(files, "chunks") && haskey(files, "sources")
chunks_file = files["chunks"]
sources_file = files["sources"]
chunks = deserialize(chunks_file)
sources = deserialize(sources_file)
cost_tracker = Threads.Atomic{Float64}(0.0)
full_embeddings = RT.get_embeddings(
embedder, chunks; model, verbose = false, cost_tracker)
@info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))"

trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
fn_output = joinpath(knowledge_pack_path, "packs",
"$hostname-$model-$trunc-Float32__v1.0.tar.gz")
fn_temp = joinpath(knowledge_pack_path, "packs",
"$hostname-$model-$trunc-Float32__v1.0.hdf5")

h5open(fn_temp, "w") do file
file["chunks"] = chunks
file["sources"] = sources
file["embeddings"] = full_embeddings[1:embedding_size, :] |>
l2_norm_columns |> x -> map(>(0), x)
file["type"] = "ChunkIndex"

package_url_dict = Dict{String, Vector{String}}()
package_url_dict = urls_for_metadata(sources)

metadata = Dict(
:embedded_dt => Dates.today(),
:custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size,
:embedding_size => embedding_size, :model => model,
:packages => package_url_dict)

metadata_json = JSON.json(metadata)
file["metadata"] = metadata_json
end

command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
run(command)
report_artifact(fn_output)
append!(chunks, deserialize(chunks_file))
append!(sources, deserialize(sources_file))

else
@warn "Missing pair for hostname: $hostname, max chunk size: $max_chunk_size"
end
end
end

# Generate embeddings
cost_tracker = Threads.Atomic{Float64}(0.0)
full_embeddings = RT.get_embeddings(
embedder, chunks; model, verbose = false, cost_tracker)

full_embeddings = full_embeddings[1:embedding_size, :] |>
l2_norm_columns

if bool_embeddings
full_embeddings = map(>(0), full_embeddings)
end

if isempty(index_name)
rand_int = rand(1000:100000)
date = Dates.today()
index_name = "$(date)-$(rand_int)"
end

@info "Created embeddings for $index_name. Cost: \$$(round(cost_tracker[], digits=3))"

trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
emb_data_type = bool_embeddings ? "Bool" : "Float32"

fn_output = joinpath(knowledge_pack_path, "packs",
"$index_name-$model-$trunc-$(emb_data_type)__v1.0.tar.gz")
fn_temp = joinpath(knowledge_pack_path, "packs",
"$index_name-$model-$trunc-$(emb_data_type)__v1.0.hdf5")

h5open(fn_temp, "w") do file
file["chunks"] = chunks
file["sources"] = sources
file["embeddings"] = full_embeddings
file["type"] = "ChunkIndex"

package_url_dict = urls_for_metadata(sources)

metadata = Dict(
:embedded_dt => Dates.today(),
:custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size,
:embedding_size => embedding_size, :model => model,
:packages => package_url_dict)

metadata_json = JSON.json(metadata)
file["metadata"] = metadata_json
end

command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
run(command)
report_artifact(fn_output)
end
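
To illustrate the resulting artifact, here is a sketch of opening the temporary HDF5 file the block above writes before tarring. The file name is hypothetical but follows the `$index_name-$model-$trunc-$(emb_data_type)__v1.0.hdf5` scheme:

using HDF5, JSON

# Hypothetical pack file; adjust to the actual index name and embedding model.
fn_temp = joinpath("knowledge_packs", "packs",
    "2024-08-16-4242-text-embedding-3-large-1-Bool__v1.0.hdf5")

h5open(fn_temp, "r") do file
    chunks = read(file["chunks"])
    embeddings = read(file["embeddings"])   # embedding_size × n_chunks
    metadata = JSON.parse(read(file["metadata"]))
    @info "Pack contents" n_chunks=length(chunks) emb_size=size(embeddings, 1) model=metadata["model"]
end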

"""
make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],
max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE,
custom_metadata::AbstractString)
custom_metadata::AbstractString, bool_embeddings::Bool = true, index_name::AbstractString = "")

Entry point to crawl, parse and generate embeddings
Entry point to crawl, parse, and generate embeddings.
Note: We highly recommend passing `index_name`; it becomes the name of the generated index. Default: date-randomInt

# Arguments
- crawlable_urls: URLs that should be crawled to find more links
@@ -206,11 +229,14 @@ Entry point to crawl, parse and generate embeddings
- model: Embedding model
- embedding_size: Embedding dimensions
- custom_metadata: Custom metadata like ecosystem name if required
- bool_embeddings: If true, the generated embeddings are boolean; otherwise Float32
- index_name: Name of the index. Default: date-randomInt
"""
function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[];
single_urls::Vector{<:AbstractString} = String[],
max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE,
model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "")
model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "",
bool_embeddings::Bool = true, index_name::AbstractString = "")
if isempty(crawlable_urls) && isempty(single_urls)
error("At least one of `input_urls` or `single_pages` must be provided.")
end
@@ -241,5 +267,7 @@ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[
make_chunks(
hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size)
generate_embeddings(
knowledge_pack_path; max_chunk_size, model, embedding_size, custom_metadata)
knowledge_pack_path; max_chunk_size, model, embedding_size,
custom_metadata, bool_embeddings, index_name)

end
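
A minimal end-to-end sketch of the updated entry point with the new keyword arguments; the URL is a placeholder, and the remaining defaults (`MODEL`, `EMBEDDING_SIZE`, chunk sizes) come from the package constants:

using DocsScraper

# Placeholder docs site; pass any crawlable documentation URLs.
make_knowledge_packs(["https://docs.example.com/stable/"];
    index_name = "examplepack",    # recommended; otherwise defaults to date-randomInt
    bool_embeddings = true,        # Bool embeddings; set false for Float32
    custom_metadata = "example ecosystem")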