tags
+Extract URLs inside HTML or XML files
# Arguments
- url: url from which all other URLs will be extracted
- url_queue: Vector in which extracted URLs will be appended
"""
function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString})
-
@info "Scraping link: $url"
- # println(url)
- # try
fetched_content = HTTP.get(url)
parsed = Gumbo.parsehtml(String(fetched_content.body))
- if (url[end-3:end] == ".xml")
+ if (url[(end - 3):end] == ".xml")
find_urls_xml!(url, url_queue)
else
find_urls_html!(url, parsed.root, url_queue)
end
- # print("-------------")
- # catch e
- # println("Bad URL: $url")
- # end
end
\ No newline at end of file
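
For context, a minimal usage sketch of `get_urls!` above, assuming the rest of `extract_urls.jl` (i.e. `find_urls_html!`/`find_urls_xml!`) is in scope; the seed URL is the one used in the test suite.

```julia
# Collect links reachable from a seed page; discovered URLs are appended to the queue.
url_queue = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
url = popfirst!(url_queue)
get_urls!(url, url_queue)
@info "Discovered $(length(url_queue)) new links from $url"
```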
diff --git a/src/make_embeddings.jl b/src/make_embeddings.jl
deleted file mode 100644
index f51c865..0000000
--- a/src/make_embeddings.jl
+++ /dev/null
@@ -1,173 +0,0 @@
-## TODO: Make a function to Check for version number
-
-"""
- report_artifact()
-
-prints artifact information
-"""
-function report_artifact(fn_output)
- @info("ARTIFACT: $(basename(fn_output))")
- @info("sha256: ", bytes2hex(open(sha256, fn_output)))
- @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output))))
-end
-
-
-
-
-"""
- create_output_folders()
-
-Creates output folders
-"""
-function create_output_folders(knowledge_pack_path::String)
- # Define the folder path
- folder_path = joinpath(knowledge_pack_path, "packs")
- println("folder_path:", folder_path)
- # Check if the folder exists
- if !isdir(folder_path)
- mkpath(folder_path)
- @info "Folder created: $folder_path"
- else
- @info "Folder already exists: $folder_path"
- end
-
-end
-
-"""
- make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}})
-
-Parses URLs from hostname_url_dict and saves the chunks
-
-# Arguments
-- hostname_url_dict: Dict with key being hostname and value being a vector of URLs
-"""
-function make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String)
- output_chunks = Vector{SubString{String}}()
- output_sources = Vector{String}()
- SAVE_CHUNKS = true
- CHUNK_SIZE = 512
- for (hostname, urls) in hostname_url_dict
- for url in urls
- try
- chunks, sources = process_paths(url)
- append!(output_chunks, chunks)
- append!(output_sources, sources)
- catch
- @error "error!! check url: $url"
- end
- end
- if SAVE_CHUNKS
- serialize(joinpath(knowledge_pack_path, "$(hostname)-chunks-$(CHUNK_SIZE).jls"), output_chunks)
- serialize(joinpath(knowledge_pack_path, "$(hostname)-sources-$(CHUNK_SIZE).jls"), output_sources)
- end
-
- end
-
-
-end
-
-function l2_norm_columns(mat::AbstractMatrix)
- norm_ = norm.(eachcol(mat))
- return mat ./ norm_'
-end
-function l2_norm_columns(vect::AbstractVector)
- norm_ = norm(vect)
- return vect / norm_
-end
-
-
-"""
- generate_embeddings()
-
-Deserializes chunks and sources to generate embeddings
-"""
-function generate_embeddings(knowledge_pack_path::String)
- embedder = RT.BatchEmbedder()
- entries = readdir(knowledge_pack_path)
-
- # Initialize a dictionary to group files by hostname and chunk size
- hostname_files = Dict{String,Dict{Int,Dict{String,String}}}()
-
- # Regular expressions to match the file patterns
- chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$"
- sources_pattern = r"^(.*)-sources-(\d+)\.jls$"
-
- # Group files by hostname and chunk size
- for file in entries
- match_chunks = match(chunks_pattern, file)
- match_sources = match(sources_pattern, file)
-
- if match_chunks !== nothing
- hostname = match_chunks.captures[1]
- chunk_size = parse(Int, match_chunks.captures[2])
- if !haskey(hostname_files, hostname)
- hostname_files[hostname] = Dict{Int,Dict{String,String}}()
- end
- if !haskey(hostname_files[hostname], chunk_size)
- hostname_files[hostname][chunk_size] = Dict{String,String}()
- end
- hostname_files[hostname][chunk_size]["chunks"] = joinpath(knowledge_pack_path, file)
- elseif match_sources !== nothing
- hostname = match_sources.captures[1]
- chunk_size = parse(Int, match_sources.captures[2])
- if !haskey(hostname_files, hostname)
- hostname_files[hostname] = Dict{Int,Dict{String,String}}()
- end
- if !haskey(hostname_files[hostname], chunk_size)
- hostname_files[hostname][chunk_size] = Dict{String,String}()
- end
- hostname_files[hostname][chunk_size]["sources"] = joinpath(knowledge_pack_path, file)
- end
- end
-
-
- # Process each pair of files
- for (hostname, chunk_files) in hostname_files
- for (chunk_size, files) in chunk_files
- if haskey(files, "chunks") && haskey(files, "sources")
- chunks_file = files["chunks"]
- sources_file = files["sources"]
- chunks = deserialize(chunks_file)
- sources = deserialize(sources_file)
- cost_tracker = Threads.Atomic{Float64}(0.0)
- full_embeddings = RT.get_embeddings(embedder, chunks; model="text-embedding-3-large", verbose=false, cost_tracker, dimensions=1024)
-
- fn_output = joinpath(knowledge_pack_path, "packs", "$hostname-textembedding3large-0-Float32__v1.0.tar.gz")
- fn_temp = joinpath(knowledge_pack_path, "packs", "pack.hdf5")
- h5open(fn_temp, "w") do file
- file["chunks"] = chunks
- file["sources"] = sources
- file["embeddings"] = full_embeddings[1:1024, :] |> l2_norm_columns |> x -> map(>(0), x)
- file["type"] = "ChunkIndex"
- # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl"
- end
- command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
- run(command)
- report_artifact(fn_output)
-
- else
- @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size"
- end
- end
- end
-
-end
-
-
-
-"""
- make_embeddings(input_urls::Vector{<:AbstractString})
-
-Entry point to crawl, parse and create embeddings
-
-# Arguments
-- input_urls: vector containing URL strings to parse
-"""
-function make_embeddings(input_urls::Vector{<:AbstractString})
- hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}()
- hostname_url_dict = crawl(input_urls)
- knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs")
- create_output_folders(knowledge_pack_path)
- make_chunks(hostname_url_dict, knowledge_pack_path)
- generate_embeddings(knowledge_pack_path)
-end
\ No newline at end of file
diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl
new file mode 100644
index 0000000..291a9c7
--- /dev/null
+++ b/src/make_knowledge_packs.jl
@@ -0,0 +1,222 @@
+"""
+ report_artifact(fn_output)
+
+Print artifact information
+"""
+function report_artifact(fn_output)
+ @info("ARTIFACT: $(basename(fn_output))")
+ @info("sha256: ", bytes2hex(open(sha256, fn_output)))
+ @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output))))
+end
+
+"""
+ create_output_folders(knowledge_pack_path::String)
+
+Create output folders under `knowledge_pack_path`
+"""
+function create_output_folders(knowledge_pack_path::String)
+ # Define the folder path
+ folder_path = joinpath(knowledge_pack_path, "packs")
+ # Check if the folder exists
+ if !isdir(folder_path)
+ mkpath(folder_path)
+ end
+end
+
+"""
+ make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; max_chunk_size::Int=MAX_CHUNK_SIZE,
+ min_chunk_size::Int=MIN_CHUNK_SIZE)
+
+Parse URLs from hostname_url_dict and save the chunks
+
+# Arguments
+- hostname_url_dict: Dict with key being hostname and value being a vector of URLs
+- knowledge_pack_path: Knowledge pack path
+- max_chunk_size: Maximum chunk size
+- min_chunk_size: Minimum chunk size
+"""
+function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}},
+ knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
+ min_chunk_size::Int = MIN_CHUNK_SIZE)
+ SAVE_CHUNKS = true
+ for (hostname, urls) in hostname_url_dict
+ output_chunks = Vector{SubString{String}}()
+ output_sources = Vector{String}()
+ for url in urls
+ try
+ chunks, sources = process_paths(url; max_chunk_size, min_chunk_size)
+ append!(output_chunks, chunks)
+ append!(output_sources, sources)
+ catch
+                @error "Error processing URL: $url"
+ end
+ end
+ if SAVE_CHUNKS
+ serialize(
+ joinpath(knowledge_pack_path,
+ "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
+ output_chunks)
+ serialize(
+ joinpath(knowledge_pack_path,
+ "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
+ output_sources)
+ end
+ end
+end
+
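For reference, a sketch of loading the serialized output back with `Serialization`; the hostname is illustrative and the sizes are the defaults from `user_preferences.jl`, only the filename pattern comes from `make_chunks` above.

```julia
using Serialization

hostname = "docs.julialang.org"        # illustrative; depends on the crawled site
chunks = deserialize(joinpath("knowledge_packs", "$(hostname)-chunks-max-256-min-40.jls"))
sources = deserialize(joinpath("knowledge_packs", "$(hostname)-sources-max-256-min-40.jls"))
@assert length(chunks) == length(sources)    # one source string per chunk
```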
+"""
+ l2_norm_columns(mat::AbstractMatrix)
+
+Normalize the columns of the input embeddings
+"""
+function l2_norm_columns(mat::AbstractMatrix)
+ norm_ = norm.(eachcol(mat))
+ return mat ./ norm_'
+end
+
+"""
+ l2_norm_columns(vect::AbstractVector)
+
+Normalize the columns of the input embeddings
+"""
+function l2_norm_columns(vect::AbstractVector)
+ norm_ = norm(vect)
+ return vect / norm_
+end
+
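A small self-contained check of the two normalization helpers above:

```julia
using LinearAlgebra

M = [3.0 0.0;
     4.0 5.0]
M_unit = l2_norm_columns(M)              # scale every column to unit L2 norm
@assert all(c -> isapprox(norm(c), 1.0), eachcol(M_unit))

l2_norm_columns([3.0, 4.0])              # returns [0.6, 0.8]
```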
+"""
+ generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE)
+
+Deserialize chunks and sources to generate embeddings
+
+# Arguments
+- model: Embedding model
+- embedding_size: Embedding dimensions
+"""
+function generate_embeddings(knowledge_pack_path::String; model::AbstractString = MODEL,
+ embedding_size::Int = EMBEDDING_SIZE)
+ embedder = RT.BatchEmbedder()
+ entries = readdir(knowledge_pack_path)
+ # Initialize a dictionary to group files by hostname and chunk size
+ hostname_files = Dict{String, Dict{Int, Dict{String, String}}}()
+
+ # Regular expressions to match the file patterns of chunks and sources
+ chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$"
+ sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$"
+
+ # chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$"
+ # sources_pattern = r"^(.*)-sources-(\d+)\.jls$"
+
+ # Group files by hostname and chunk size
+ for file in entries
+ match_chunks = match(chunks_pattern, file)
+ match_sources = match(sources_pattern, file)
+
+ if match_chunks !== nothing
+ hostname = match_chunks.captures[1]
+ chunk_size = parse(Int, match_chunks.captures[2])
+ if !haskey(hostname_files, hostname)
+ hostname_files[hostname] = Dict{Int, Dict{String, String}}()
+ end
+ if !haskey(hostname_files[hostname], chunk_size)
+ hostname_files[hostname][chunk_size] = Dict{String, String}()
+ end
+ hostname_files[hostname][chunk_size]["chunks"] = joinpath(
+ knowledge_pack_path, file)
+ elseif match_sources !== nothing
+ hostname = match_sources.captures[1]
+ chunk_size = parse(Int, match_sources.captures[2])
+ if !haskey(hostname_files, hostname)
+ hostname_files[hostname] = Dict{Int, Dict{String, String}}()
+ end
+ if !haskey(hostname_files[hostname], chunk_size)
+ hostname_files[hostname][chunk_size] = Dict{String, String}()
+ end
+ hostname_files[hostname][chunk_size]["sources"] = joinpath(
+ knowledge_pack_path, file)
+ end
+ end
+ # Process each pair of files
+ for (hostname, chunk_files) in hostname_files
+ for (chunk_size, files) in chunk_files
+ if haskey(files, "chunks") && haskey(files, "sources")
+ chunks_file = files["chunks"]
+ sources_file = files["sources"]
+ chunks = deserialize(chunks_file)
+ sources = deserialize(sources_file)
+ cost_tracker = Threads.Atomic{Float64}(0.0)
+ full_embeddings = RT.get_embeddings(
+ embedder, chunks; model, verbose = false, cost_tracker)
+ @info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))"
+ fn_output = joinpath(knowledge_pack_path, "packs",
+ "$hostname-textembedding3large-0-Float32__v1.0.tar.gz")
+ fn_temp = joinpath(knowledge_pack_path, "packs",
+ "$hostname-textembedding3large-0-Float32__v1.0.hdf5")
+ h5open(fn_temp, "w") do file
+ file["chunks"] = chunks
+ file["sources"] = sources
+ file["embeddings"] = full_embeddings[1:embedding_size, :] |>
+ l2_norm_columns |> x -> map(>(0), x)
+ file["type"] = "ChunkIndex"
+ # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl"
+ end
+
+ command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
+ run(command)
+ report_artifact(fn_output)
+
+ else
+ @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size"
+ end
+ end
+ end
+end
+
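A sketch of inspecting a generated pack with HDF5.jl; the filename is illustrative, but the dataset keys (`chunks`, `sources`, `embeddings`) are the ones written above.

```julia
using HDF5

fn = joinpath("knowledge_packs", "packs",
    "docs.julialang.org-textembedding3large-0-Float32__v1.0.hdf5")   # illustrative name
h5open(fn, "r") do f
    chunks = read(f["chunks"])
    sources = read(f["sources"])
    embeddings = read(f["embeddings"])    # one column per chunk, binarized via map(>(0), ...)
    @info "Loaded $(length(chunks)) chunks; embedding size $(size(embeddings, 1))"
end
```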
+"""
+ make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],
+ max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE)
+
+Entry point to crawl, parse and generate embeddings
+
+# Arguments
+- crawlable_urls: URLs that should be crawled to find more links
+- single_urls: Single page URLs that should just be scraped and parsed. The crawler won't look for more URLs
+- max_chunk_size: Maximum chunk size
+- min_chunk_size: Minimum chunk size
+- model: Embedding model
+- embedding_size: Embedding dimensions
+"""
+function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[];
+ single_urls::Vector{<:AbstractString} = String[],
+ max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE,
+ model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE)
+ if isempty(crawlable_urls) && isempty(single_urls)
+ error("At least one of `input_urls` or `single_pages` must be provided.")
+ end
+
+ hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}()
+
+ if !isempty(crawlable_urls)
+ hostname_url_dict, visited_url_set = crawl(crawlable_urls)
+ else
+ visited_url_set = Set{AbstractString}()
+ end
+ for url in single_urls
+ base_url = get_base_url(url)
+ if !in(base_url, visited_url_set)
+ push!(visited_url_set, base_url)
+ crawlable, sitemap_urls = check_robots_txt("*", base_url)
+ if crawlable
+ try
+ process_hostname!(url, hostname_url_dict)
+ catch
+ @error "Bad URL: $base_url"
+ end
+ end
+ end
+ end
+ knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs")
+ create_output_folders(knowledge_pack_path)
+ make_chunks(hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size)
+ generate_embeddings(knowledge_pack_path; model, embedding_size)
+end
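A usage sketch for the entry point; the crawlable URL comes from the test suite, the single-page URL is illustrative, and the chunk sizes match the defaults in `user_preferences.jl`.

```julia
using DocsScraper

make_knowledge_packs(["https://docs.julialang.org/en/v1/"];
    single_urls = ["https://docs.julialang.org/en/v1/base/multi-threading/"],
    max_chunk_size = 256, min_chunk_size = 40)
# Chunks/sources (.jls) and the packed index (.hdf5 + .tar.gz) land under knowledge_packs/.
```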
diff --git a/src/parser.jl b/src/parser.jl
index d909280..def1a17 100644
--- a/src/parser.jl
+++ b/src/parser.jl
@@ -1,21 +1,3 @@
-"""
-Working:
-
-Since HTML structure is complex, we need to figure out when do we insert the extracted text in parsed_blocks
-ie., should we add the text of child hierarchy and then insert or should we insert now and let the child hierarchy make another insertion.
-For this we employ multiple checks. If the current node is heading, directly insert into parsed_blocks.
-If the current node is a code block, return the text inside code block with backticks.
-If the node is neither heading nor code, then we'll need to go deeper in the hierarchy.
-if the current node's tag is from the list [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td]
-it is assumed that everything inside the tag is part of a single text block with inline code.
-But when we go deeper and if there is a code block with size > 50 chars, then our assumption was false.
-To correct this, we first insert the previously extracted text, next we insert the current code and additionally indicate the parent recursion iteration
-that the current iteration has inserted the previously parsed text, so there is no need for parent iteration to insert the text block again.
-We indicate this by a return flag is_text_inserted
-"""
-
-
-
"""
insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
parsed_blocks::Vector{Dict{String,Any}},
@@ -30,11 +12,10 @@ Insert the text into parsed_blocks Vector
- text_to_insert: Text to be inserted
- text_type: The text to be inserted could be heading or a code block or just text
"""
-function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- text_to_insert::AbstractString,
- text_type::AbstractString)
-
+function insert_parsed_data!(heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ text_to_insert::AbstractString,
+ text_type::AbstractString)
if !isempty(strip(text_to_insert))
push!(parsed_blocks,
Dict(text_type => strip(text_to_insert),
@@ -42,8 +23,6 @@ function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
end
end
-
-
"""
process_headings!(node::Gumbo.HTMLElement,
heading_hierarchy::Dict{Symbol,Any},
@@ -57,13 +36,13 @@ Process headings. If the current node is heading, directly insert into parsed_bl
- parsed_blocks: Vector of Dicts to store parsed text and metadata
"""
function process_headings!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}})
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}})
tag_name = Gumbo.tag(node)
# Clear headings of equal or lower level
for k in collect(keys(heading_hierarchy))
- if k != "header" && Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name)))
+ if k != "header" &&
+ Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name)))
delete!(heading_hierarchy, k)
end
end
@@ -123,11 +102,10 @@ If the node is neither heading nor code
- prev_text_buffer: IO Buffer which contains previous text
"""
function process_generic_node!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- child_new::Bool=true,
- prev_text_buffer::IO=IOBuffer(write=true))
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ child_new::Bool = true,
+ prev_text_buffer::IO = IOBuffer(write = true))
seekstart(prev_text_buffer)
prev_text = read(prev_text_buffer, String)
@@ -142,10 +120,15 @@ function process_generic_node!(node::Gumbo.HTMLElement,
# if the current tag belongs in the list, it is assumed that all the text/code should be part of a single paragraph/block, unless,
# there occurs a code block with >50 chars, then, previously parsed text is inserted first, then the code block is inserted.
- if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td, :a, :span, :header]
- received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, false, prev_text_buffer)
+ if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i,
+ :cite, :address, :em, :td, :a, :span, :header]
+ received_text, is_code_block, is_text_inserted = process_node!(
+ child, heading_hierarchy, parsed_blocks, false, prev_text_buffer)
+ elseif tag_name in [:script]
+ continue
else
- received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+ received_text, is_code_block, is_text_inserted = process_node!(
+ child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
end
# changing text_to_insert to "" to avoid inserting text_to_insert again (as it was inserted by the child recursion call)
@@ -180,7 +163,6 @@ function process_generic_node!(node::Gumbo.HTMLElement,
print(prev_text_buffer, " " * received_text)
text_to_insert = text_to_insert * " " * received_text
end
-
end
# if child_new is false, this means new child (new entry in parsed_blocks) should not be created, hence,
@@ -195,7 +177,8 @@ function process_generic_node!(node::Gumbo.HTMLElement,
# if we're insert text in current node level, then we should insert the previous text if available,
# otherwise it'll be inserted when the control goes back to the parent call and hence, order of the insertion will be weird
if !isempty(strip(text_to_insert))
- insert_parsed_data!(heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text")
+ insert_parsed_data!(
+ heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text")
is_text_inserted = true
end
@@ -205,7 +188,6 @@ function process_generic_node!(node::Gumbo.HTMLElement,
return "", is_code_block, is_text_inserted
end
-
"""
process_docstring!(node::Gumbo.HTMLElement,
heading_hierarchy::Dict{Symbol,Any},
@@ -224,11 +206,10 @@ Function to process node of class `docstring`
- prev_text_buffer: IO Buffer which contains previous text
"""
function process_docstring!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- child_new::Bool=true,
- prev_text_buffer::IO=IOBuffer(write=true))
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ child_new::Bool = true,
+ prev_text_buffer::IO = IOBuffer(write = true))
seekstart(prev_text_buffer)
prev_text = read(prev_text_buffer, String)
is_code_block = false
@@ -248,10 +229,12 @@ function process_docstring!(node::Gumbo.HTMLElement,
# Insert "header"
if Gumbo.tag(children[1]) == :header
heading_hierarchy[:docstring_header] = strip(Gumbo.text(children[1]))
- insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header")
+ insert_parsed_data!(
+ heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header")
end
- received_text, is_code_block, is_text_inserted = process_node!(children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+ received_text, is_code_block, is_text_inserted = process_node!(
+ children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
if !isempty(strip(received_text))
insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "text")
@@ -279,11 +262,10 @@ Function to process a node
- prev_text_buffer: IO Buffer which contains previous text
"""
function process_node!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- child_new::Bool=true,
- prev_text_buffer::IO=IOBuffer(write=true))
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ child_new::Bool = true,
+ prev_text_buffer::IO = IOBuffer(write = true))
tag_name = Gumbo.tag(node)
if startswith(string(tag_name), "h") && isdigit(last(string(tag_name)))
return process_headings!(node, heading_hierarchy, parsed_blocks)
@@ -292,15 +274,14 @@ function process_node!(node::Gumbo.HTMLElement,
return process_code(node)
elseif tag_name == :article && getattr(node, "class", "") == "docstring"
- return process_docstring!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
-
+ return process_docstring!(
+ node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
end
- return process_generic_node!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
-
+ return process_generic_node!(
+ node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
end
-
"""
multiple dispatch for process_node!() when node is of type Gumbo.HTMLText
"""
@@ -310,14 +291,10 @@ function process_node!(node::Gumbo.HTMLText, args...)
return strip(Gumbo.text(node)), is_code_block, is_text_inserted
end
-
"""
get_base_url(url::AbstractString)
-Extracts the base url.
-
-# Arguments
-- `url`: The url string of which, the base url needs to be extracted
+Extract the base url.
"""
function get_base_url(url::AbstractString)
parsed_url = URIs.URI(url)
@@ -329,7 +306,7 @@ end
"""
get_html_content(root::Gumbo.HTMLElement)
-Returns the main content of the HTML. If not found, returns the whole HTML to parse
+Return the main content of the HTML. If not found, return the whole HTML to parse
# Arguments
- `root`: The HTML root from which content is extracted
@@ -338,73 +315,34 @@ function get_html_content(root::Gumbo.HTMLElement)
target_ids = Set(["VPContent", "main_content_wrap", "pages-content"])
target_classes = Set(["content", "franklin-content"])
- content_candidates = [el for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement]
+ content_candidates = [el
+ for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement]
# First try to find by ID
- content_by_id = filter(el -> getattr(el, "id", nothing) in target_ids, content_candidates)
+ content_by_id = filter(
+ el -> getattr(el, "id", nothing) in target_ids, content_candidates)
if !isempty(content_by_id)
return only(content_by_id)
end
# Fallback to class if no ID matches
- content_by_class = filter(el -> getattr(el, "class", nothing) in target_classes, content_candidates)
+ content_by_class = filter(
+ el -> getattr(el, "class", nothing) in target_classes, content_candidates)
if !isempty(content_by_class)
return only(content_by_class)
end
# Fallback to the root node if no class matches
return root
-
end
-
"""
parse_url(url::AbstractString)
-Initiator and main function to parse HTML from url
+Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata
# Arguments
- `url`: URL string to parse
-
-# Returns
-- A Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata
-
-# Usage
-parsed_blocks = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/")
-
-# Example
-Let the HTML be:
-
-
-
-
- Heading 1
- Heading 2
- para 1
- Heading 3
- this is my code block
- This is another h3 under Heading 2
- This is a paragraph with inline code
-
- Heading 2_2
- para ewg
-
-
-
-
-Output:
-Any[
- Dict{String, Any}("URL" => "URL")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3")
- Dict{String, Any}("code" => "```julia this is my code block```", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"))
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg")
-]
"""
function parse_url_to_blocks(url::AbstractString)
@@ -419,8 +357,8 @@ function parse_url_to_blocks(url::AbstractString)
# title = [el
# for el in AbstractTrees.PreOrderDFS(r_parsed.root)
# if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")
- parsed_blocks = Vector{Dict{String,Any}}([Dict("Source" => base_url)])
- heading_hierarchy = Dict{Symbol,Any}()
+ parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)])
+ heading_hierarchy = Dict{Symbol, Any}()
process_node!(get_html_content(parsed.root), heading_hierarchy, parsed_blocks)
return parsed_blocks
catch
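
A brief usage sketch (the example URL is the one from the docstring removed above):

```julia
parsed_blocks = parse_url_to_blocks("https://docs.julialang.org/en/v1/base/multi-threading/")
# The first entry records the "Source" URL; each following Dict holds one of
# "heading"/"text"/"code" plus a "metadata" Dict with the enclosing h1/h2/h3 headers.
```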
diff --git a/src/preparation.jl b/src/preparation.jl
index ab8d7b5..9979155 100644
--- a/src/preparation.jl
+++ b/src/preparation.jl
@@ -1,9 +1,7 @@
-# include("recursive_splitter.jl")
-include("utils.jl")
"""
get_header_path(d::Dict)
-Concatenates the h1, h2, h3 keys from the metadata of a Dict
+Concatenate the h1, h2, h3 keys from the metadata of a Dict
# Examples
```julia
@@ -12,7 +10,7 @@ get_header_path(d)
# Output: "Axis/Attributes/yzoomkey"
```
"""
-function get_header_path(d::Dict)
+function get_header_path(d::Dict{String,Any})
metadata = get(d, "metadata", Dict{Any,Any}())
isempty(metadata) && return nothing
keys_ = [:h1, :h2, :h3]
@@ -21,8 +19,13 @@ function get_header_path(d::Dict)
end
-"Roll-up chunks (that have the same header!), so we can split them later by to get the desired length"
-function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="")
+
+"""
+ roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="")
+
+Roll-up chunks (that have the same header!), so we can split them later by to get the desired length
+"""
+function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="")
docs = String[]
io = IOBuffer()
last_header = nothing
@@ -35,7 +38,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="<SEP>")
str = String(take!(io))
if !isempty(str)
push!(docs, str)
- src = url * (isnothing(last_header) ? "" : "::$last_header")
+ src = url * (isnothing(last_header) ? "" : " - $last_header")
push!(sources, src)
end
last_header = header
@@ -48,7 +51,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="<SEP>")
str = String(take!(io))
if !isempty(str)
push!(docs, str)
- src = url * (isnothing(last_header) ? "" : "::$last_header")
+ src = url * (isnothing(last_header) ? "" : " - $last_header")
push!(sources, src)
end
return docs, sources
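
Usage mirrors the test suite, assuming the `<SEP>` sentinel as separator:

```julia
SEP = "<SEP>"
url = "https://docs.julialang.org/en/v1/"
parsed_blocks = parse_url_to_blocks(url)
docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
# Each source is the page URL plus " - <header path>" whenever a header was present.
```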
@@ -56,19 +59,23 @@ end
struct DocParserChunker <: RT.AbstractChunker end
-"""
- RT.get_chunks(chunker::DocParserChunker,
- html_files::Vector{<:AbstractString};
- sources::AbstractVector{<:AbstractString}=html_files,
- verbose::Bool=true,
- separators=["\n\n", ". ", "\n", " "], max_length::Int=256)
-Extracts chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, and splits them by separators to get the desired length.
+"""
+ RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
+ verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE)
+
+Extract chunks from the HTML page by parsing its content, rolling up chunks by headers,
+and splitting them by separators to get the desired length.
+
+# Arguments
+- chunker: DocParserChunker
+- url: URL of the webpage to extract chunks from
+- verbose: Bool to print the log
+- separators: Chunk separators
+- max_chunk_size: Maximum chunk size
"""
function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
- verbose::Bool=true,
- separators=["\n\n", ". ", "\n", " "], max_length::Int=256)
-
+ verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE)
    SEP = "<SEP>"
sources = AbstractVector{<:AbstractString}
@@ -84,8 +91,9 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
## roll up chunks by SEP splitter, then remove it later
for (doc, src) in zip(docs_, sources_)
## roll up chunks by SEP splitter, then remove it later
- doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length) .|>
+ doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length=max_chunk_size) .|>
x -> replace(x, SEP => " ") .|> strip |> x -> filter(!isempty, x)
+ chunk_lengths = length.(doc_chunks)
# skip if no chunks found
isempty(doc_chunks) && continue
append!(output_chunks, doc_chunks)
@@ -96,20 +104,24 @@ end
-"Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them."
-function process_paths(url::AbstractString, max_length::Int=512)
+"""
+ process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
+
+Scrape the page at `url`, split it into chunks of at most `max_chunk_size`, and postprocess them (drop short chunks and duplicates).
+"""
+function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
output_chunks = Vector{SubString{String}}()
output_sources = Vector{String}()
- chunks, sources = RT.get_chunks(DocParserChunker(), url; max_length)
+ chunks, sources = RT.get_chunks(DocParserChunker(), url; max_chunk_size)
append!(output_chunks, chunks)
append!(output_sources, sources)
@info "Scraping done: $(length(output_chunks)) chunks"
- postprocess_chunks(output_chunks, output_sources; min_length=40, skip_code=true)
+ output_chunks, output_sources = postprocess_chunks(output_chunks, output_sources; min_chunk_size, skip_code=true)
return output_chunks, output_sources
end
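
End-to-end usage for a single page, as exercised in the tests (the chunk sizes shown are the package defaults):

```julia
url = "https://docs.julialang.org/en/v1/"
chunks, sources = process_paths(url; max_chunk_size = 256, min_chunk_size = 40)
@info "Got $(length(chunks)) postprocessed chunks from $url"
```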
diff --git a/src/user_preferences.jl b/src/user_preferences.jl
new file mode 100644
index 0000000..98794c6
--- /dev/null
+++ b/src/user_preferences.jl
@@ -0,0 +1,4 @@
+global MIN_CHUNK_SIZE = 40
+global MAX_CHUNK_SIZE = 256
+global MODEL = "text-embedding-3-large"
+global EMBEDDING_SIZE = 1024
\ No newline at end of file
diff --git a/src/utils.jl b/src/utils.jl
index 4bf1e07..e8dc014 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,4 +1,9 @@
-"Finds duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, where `true` indicates a duplicate (second instance of the same text)."
+"""
+ find_duplicates(chunks::AbstractVector{<:AbstractString})
+
+Find duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list,
+where `true` indicates a duplicate (second instance of the same text).
+"""
function find_duplicates(chunks::AbstractVector{<:AbstractString})
# hash the chunks for easier search
hashed_chunks = bytes2hex.(sha256.(chunks))
@@ -20,20 +25,34 @@ function find_duplicates(chunks::AbstractVector{<:AbstractString})
return duplicates
end
-"Removes chunks that are duplicated in the input list of chunks and their corresponding sources."
+"""
+ remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
+
+Remove chunks that are duplicated in the input list of chunks and their corresponding sources.
+"""
function remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
idxs = find_duplicates(chunks)
return chunks[.!idxs], sources[.!idxs]
end
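
A quick self-contained check of the deduplication helpers on toy data:

```julia
chunks = ["alpha", "beta", "alpha"]
sources = ["url1", "url2", "url3"]
find_duplicates(chunks)               # flags the second "alpha" as a duplicate
remove_duplicates(chunks, sources)    # (["alpha", "beta"], ["url1", "url2"])
```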
-"Removes chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources."
-function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true)
+
+"""
+ remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
+
+Remove chunks that are shorter than a specified length (`min_chunk_size`) from the input list of chunks and their corresponding sources.
+"""
+function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
+
+ chunk_lengths = length.(chunks)
idx = if skip_code
- ## Keep short chunks if they contain code (might be combined with some preceding/suceeeding text)
- findall(x -> length(x) >= min_length || occursin("```", x), chunks)
+ ## Keep short chunks if they contain code (might be combined with some preceding/succeeding text)
+ findall(x -> length(x) >= min_chunk_size || occursin("```", x), chunks)
else
- findall(x -> length(x) >= min_length, chunks)
+ findall(x -> length(x) >= min_chunk_size, chunks)
end
+ chunk_lengths = length.(chunks[idx])
return chunks[idx], sources[idx]
end
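
And a sketch of the short-chunk filter: chunks containing a code fence are kept even when short.

```julia
code_chunk = "`"^3 * "julia\nf(x) = x\n" * "`"^3     # a short chunk that contains a code fence
chunks = ["tiny", code_chunk,
    "a chunk that is comfortably longer than forty characters overall"]
sources = ["s1", "s2", "s3"]
remove_short_chunks(chunks, sources; min_chunk_size = 40, skip_code = true)
# keeps the code chunk (despite its length) and the long chunk, drops "tiny"
```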
@@ -42,14 +61,24 @@ function replace_local_paths(sources::AbstractVector{<:AbstractString}, paths::A
@assert length(paths) == length(websites) "Length of `paths` must match length of `websites`"
replacement_pairs = paths .=> websites
output = map(x -> replace(x, replacement_pairs...), sources)
+ return output
end
-"Post-processes the input list of chunks and their corresponding sources by removing short chunks and duplicates."
-function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true,
- paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
+
+
+"""
+    postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
+ websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
+
+Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates.
+"""
+function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
+ websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
len_ = length(chunks)
- chunks, sources = remove_short_chunks(chunks, sources; min_length, skip_code)
+ chunks, sources = remove_short_chunks(chunks, sources; min_chunk_size, skip_code)
@info "Removed $(len_ - length(chunks)) short chunks"
len_ = length(chunks)
@@ -63,6 +92,31 @@ function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::A
end
return chunks, sources
+end
+
+"""
+    remove_urls_from_index(index_path::AbstractString, prefix_urls::Vector{<:AbstractString})
+
+Remove chunks and sources corresponding to URLs starting with `prefix_urls`
+"""
+function remove_urls_from_index(index_path::AbstractString, prefix_urls::Vector{<:AbstractString})
+    @assert endswith(index_path, ".hdf5") "Provided file path must end with `.hdf5` (see HDF5.jl)."
+ h5open(index_path, "r+") do orig_file
+ # Load the sources dataset into a Julia array
+ sources = read(orig_file["sources"])
+ chunks = read(orig_file["chunks"])
+ embeddings = read(orig_file["embeddings"])
+ for url_to_remove in prefix_urls
+ indices_to_remove = findall(x -> startswith(x, url_to_remove), sources)
+ sources = deleteat!(sources, indices_to_remove)
+ chunks = deleteat!(chunks, indices_to_remove)
+ embeddings = embeddings[:, setdiff(1:size(embeddings, 2), indices_to_remove)]
+ end
+
+ write(file["sources"], sources)
+ write(file["chunks"], chunks)
+ write(file["embeddings"], embeddings)
+ end
end
\ No newline at end of file
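
A usage sketch with an illustrative pack path and URL prefix:

```julia
index_path = joinpath("knowledge_packs", "packs",
    "docs.julialang.org-textembedding3large-0-Float32__v1.0.hdf5")   # illustrative pack
remove_urls_from_index(index_path, ["https://docs.julialang.org/en/v1/devdocs/"])
```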
diff --git a/test/runtests.jl b/test/runtests.jl
index 78a78b4..4b4a92c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,21 +10,22 @@ using LinearAlgebra, Unicode, SparseArrays
using HDF5
using Tar
using Inflate
-
using SHA
using Serialization, URIs
-include("..\\src\\crawl.jl")
-include("..\\src\\extract_urls.jl")
-include("..\\src\\parser.jl")
-include("..\\src\\preparation.jl")
+include(joinpath("..", "src", "crawl.jl"))
+include(joinpath("..", "src", "extract_urls.jl"))
+include(joinpath("..", "src", "parser.jl"))
+include(joinpath("..", "src", "preparation.jl"))
+include(joinpath("..", "src", "user_preferences.jl"))
+include(joinpath("..", "src", "utils.jl"))
+
urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
url = urls[1]
queue = Vector{AbstractString}()
-@testset "check robots.txt" begin
+@testset "HTTP" begin
@test HTTP.get(url) != nothing
-
result, sitemap_queue = check_robots_txt("*", url)
@test result == true
end
@@ -38,12 +39,13 @@ end
parsed_blocks = parse_url_to_blocks(url)
@test length(parsed_blocks) > 0
    SEP = "<SEP>"
- docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP)
- @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && sources_[1] != nothing
+ docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
+ @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
+ sources_[1] != nothing
end
@testset "overall test" begin
chunks, sources = process_paths(url)
- @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && sources[1] != nothing
-
+ @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing &&
+ sources[1] != nothing
end
From 0782e01ba709f18bda1cca77cb83e8b73922630e Mon Sep 17 00:00:00 2001
From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Sun, 11 Aug 2024 03:19:20 -0700
Subject: [PATCH 3/7] dependency changes
---
.github/workflows/CI.yml | 1 -
Project.toml | 4 ++++
src/DocsScraper.jl | 3 ---
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 874943f..0b6af25 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -24,7 +24,6 @@ jobs:
matrix:
version:
- "1.10"
- - "nightly"
os:
- ubuntu-latest
arch:
diff --git a/Project.toml b/Project.toml
index 705a918..bc05f3f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,8 @@ PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[compat]
AbstractTrees = "0.4.5"
@@ -26,6 +28,8 @@ PromptingTools = "0.36.0"
URIParser = "0.4.1"
URIs = "1.5.1"
Tar = "1.10.0"
+LinearAlgebra = "<0.0.1, 1"
+SparseArrays = "<0.0.1, 1"
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/src/DocsScraper.jl b/src/DocsScraper.jl
index e78dde7..40bc3ee 100644
--- a/src/DocsScraper.jl
+++ b/src/DocsScraper.jl
@@ -2,8 +2,6 @@ module DocsScraper
using HTTP, Gumbo, AbstractTrees, URIs
using Gumbo: HTMLDocument, HTMLElement
using EzXML
-using Pkg
-Pkg.develop(PackageSpec(path="C:\\Users\\shrey\\Desktop\\stuff\\assignments\\grad\\projects\\Julia\\PromptingTools.jl"))
using PromptingTools
const PT = PromptingTools
const RT = PromptingTools.Experimental.RAGTools
@@ -27,5 +25,4 @@ include("user_preferences.jl")
include("utils.jl")
export remove_urls_from_index
-
end
\ No newline at end of file
From 52998a90e7f8373d879537e8242309a93c227a39 Mon Sep 17 00:00:00 2001
From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Sun, 11 Aug 2024 03:30:37 -0700
Subject: [PATCH 4/7] dependency changes
---
.github/workflows/CI.yml | 62 ++++++++++++++++++++--------------------
Project.toml | 15 ++++++----
2 files changed, 41 insertions(+), 36 deletions(-)
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 0b6af25..1c00a7a 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -43,34 +43,34 @@ jobs:
files: lcov.info
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: false
- docs:
- name: Documentation
- runs-on: ubuntu-latest
- permissions:
- actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created
- contents: write
- statuses: write
- steps:
- - uses: actions/checkout@v4
- - uses: julia-actions/setup-julia@v2
- with:
- version: "1"
- - uses: julia-actions/cache@v2
- - name: Configure doc environment
- shell: julia --project=docs --color=yes {0}
- run: |
- using Pkg
- Pkg.develop(PackageSpec(path=pwd()))
- Pkg.instantiate()
- - uses: julia-actions/julia-buildpkg@v1
- - uses: julia-actions/julia-docdeploy@v1
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
- - name: Run doctests
- shell: julia --project=docs --color=yes {0}
- run: |
- using Documenter: DocMeta, doctest
- using DocsScraper
- DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)
- doctest(DocsScraper)
+ # docs:
+ # name: Documentation
+ # runs-on: ubuntu-latest
+ # permissions:
+ # actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created
+ # contents: write
+ # statuses: write
+ # steps:
+ # - uses: actions/checkout@v4
+ # - uses: julia-actions/setup-julia@v2
+ # with:
+ # version: "1"
+ # - uses: julia-actions/cache@v2
+ # - name: Configure doc environment
+ # shell: julia --project=docs --color=yes {0}
+ # run: |
+ # using Pkg
+ # Pkg.develop(PackageSpec(path=pwd()))
+ # Pkg.instantiate()
+ # - uses: julia-actions/julia-buildpkg@v1
+ # - uses: julia-actions/julia-docdeploy@v1
+ # env:
+ # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ # DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+ # - name: Run doctests
+ # shell: julia --project=docs --color=yes {0}
+ # run: |
+ # using Documenter: DocMeta, doctest
+ # using DocsScraper
+ # DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)
+ # doctest(DocsScraper)
diff --git a/Project.toml b/Project.toml
index bc05f3f..16502d1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,12 +10,15 @@ Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
+SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[compat]
AbstractTrees = "0.4.5"
@@ -24,12 +27,14 @@ Gumbo = "0.8.2"
HDF5 = "0.17.2"
HTTP = "1.10.4"
Inflate = "0.1.5"
+LinearAlgebra = "<0.0.1, 1"
PromptingTools = "0.36.0"
+SparseArrays = "<0.0.1, 1"
+Tar = "1.10.0"
URIParser = "0.4.1"
URIs = "1.5.1"
-Tar = "1.10.0"
-LinearAlgebra = "<0.0.1, 1"
-SparseArrays = "<0.0.1, 1"
+SHA = "0.7.0"
+
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
From 6f32002254aaa8ff829225fb9d1a92cd1980398c Mon Sep 17 00:00:00 2001
From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Wed, 14 Aug 2024 15:00:36 -0700
Subject: [PATCH 5/7] code improvements
---
.JuliaFormatter.toml | 1 +
.github/workflows/CI.yml | 70 +++++++--------
.gitignore | 4 +-
.vscode/settings.json | 6 --
LICENSE | 2 +-
Project.toml | 38 ++++----
docs/Project.toml | 10 +--
docs/make.jl | 29 ++++---
docs/src/index.md | 8 +-
src/DocsScraper.jl | 11 ++-
src/crawl.jl | 18 ++--
src/extract_package_name.jl | 162 +++++++++++++++++++++++++++++++++++
src/extract_urls.jl | 2 +-
src/make_knowledge_packs.jl | 69 ++++++++++-----
src/parser.jl | 5 +-
src/preparation.jl | 33 ++++---
src/user_preferences.jl | 4 +-
src/utils.jl | 72 ++++++++++++----
test/crawl.jl | 7 ++
test/make_knowledge_packs.jl | 8 ++
test/parser.jl | 11 +++
test/runtests.jl | 56 ++----------
test/utils.jl | 10 +++
23 files changed, 427 insertions(+), 209 deletions(-)
delete mode 100644 .vscode/settings.json
create mode 100644 src/extract_package_name.jl
create mode 100644 test/crawl.jl
create mode 100644 test/make_knowledge_packs.jl
create mode 100644 test/parser.jl
create mode 100644 test/utils.jl
diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
index 5657bd0..9601a61 100644
--- a/.JuliaFormatter.toml
+++ b/.JuliaFormatter.toml
@@ -1,2 +1,3 @@
# See https://domluna.github.io/JuliaFormatter.jl/stable/ for a list of options
style = "sciml"
+ignore = ["knowledge_packs"]
\ No newline at end of file
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 1c00a7a..5cd2adb 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -38,39 +38,39 @@ jobs:
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
- - uses: codecov/codecov-action@v4
+ # - uses: codecov/codecov-action@v4
+ # with:
+ # files: lcov.info
+ # token: ${{ secrets.CODECOV_TOKEN }}
+ # fail_ci_if_error: false
+ docs:
+ name: Documentation
+ runs-on: ubuntu-latest
+ permissions:
+ actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created
+ contents: write
+ statuses: write
+ steps:
+ - uses: actions/checkout@v4
+ - uses: julia-actions/setup-julia@v2
with:
- files: lcov.info
- token: ${{ secrets.CODECOV_TOKEN }}
- fail_ci_if_error: false
- # docs:
- # name: Documentation
- # runs-on: ubuntu-latest
- # permissions:
- # actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created
- # contents: write
- # statuses: write
- # steps:
- # - uses: actions/checkout@v4
- # - uses: julia-actions/setup-julia@v2
- # with:
- # version: "1"
- # - uses: julia-actions/cache@v2
- # - name: Configure doc environment
- # shell: julia --project=docs --color=yes {0}
- # run: |
- # using Pkg
- # Pkg.develop(PackageSpec(path=pwd()))
- # Pkg.instantiate()
- # - uses: julia-actions/julia-buildpkg@v1
- # - uses: julia-actions/julia-docdeploy@v1
- # env:
- # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- # DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
- # - name: Run doctests
- # shell: julia --project=docs --color=yes {0}
- # run: |
- # using Documenter: DocMeta, doctest
- # using DocsScraper
- # DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)
- # doctest(DocsScraper)
+ version: "1"
+ - uses: julia-actions/cache@v2
+ - name: Configure doc environment
+ shell: julia --project=docs --color=yes {0}
+ run: |
+ using Pkg
+ Pkg.develop(PackageSpec(path=pwd()))
+ Pkg.instantiate()
+ - uses: julia-actions/julia-buildpkg@v1
+ - uses: julia-actions/julia-docdeploy@v1
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+ - name: Run doctests
+ shell: julia --project=docs --color=yes {0}
+ run: |
+ using Documenter: DocMeta, doctest
+ using DocsScraper
+ DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)
+ doctest(DocsScraper)
diff --git a/.gitignore b/.gitignore
index 8e2d4ba..4a1c7f4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,6 @@ knowledge_packs/
Manifest.toml
/Manifest.toml
/docs/Manifest.toml
-/docs/build/
\ No newline at end of file
+/docs/build/
+.vscode/**
+**/.DS_Store
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index 9238ca7..0000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "cSpell.words": [
- "eachmatch",
- "postprocess"
- ]
-}
diff --git a/LICENSE b/LICENSE
index d7bd022..183f1b7 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) Shreyas Agrawal @splendidbug and J S @svilupp
+Copyright (c) Shreyas Agrawal @splendidbug and contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/Project.toml b/Project.toml
index 16502d1..1fb77c2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,40 +1,46 @@
name = "DocsScraper"
uuid = "bd71d052-5e08-40cc-a492-eb4e8da4b649"
-authors = ["Shreyas Agrawal @splendidbug and J S @svilupp"]
+authors = ["Shreyas Agrawal @splendidbug and contributors"]
version = "0.1.0"
[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
-URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[compat]
-AbstractTrees = "0.4.5"
-EzXML = "1.2.0"
-Gumbo = "0.8.2"
-HDF5 = "0.17.2"
-HTTP = "1.10.4"
-Inflate = "0.1.5"
-LinearAlgebra = "<0.0.1, 1"
-PromptingTools = "0.36.0"
-SparseArrays = "<0.0.1, 1"
-Tar = "1.10.0"
-URIParser = "0.4.1"
-URIs = "1.5.1"
-SHA = "0.7.0"
-
+AbstractTrees = "0.4"
+Aqua = "0.8"
+Dates = "1"
+EzXML = "1.2"
+Gumbo = "0.8"
+HDF5 = "0.17"
+HTTP = "1.10"
+Inflate = "0.1"
+LinearAlgebra = "1"
+PromptingTools = "0.48"
+SHA = "0.7"
+Serialization = "1"
+SparseArrays = "1"
+Tar = "1"
+Test = "1"
+URIs = "1.5"
+Unicode = "1"
+julia = "1.10"
+JSON = "0.21"
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/docs/Project.toml b/docs/Project.toml
index 41b0b18..15c39b1 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,19 +1,15 @@
[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
+DocsScraper = "bd71d052-5e08-40cc-a492-eb4e8da4b649"
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
+LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
-
-[compat]
-AbstractTrees = "0.4.5"
-Gumbo = "0.8.2"
-HTTP = "1.10.4"
-PromptingTools = "0.36.0"
-URIs = "1.5.1"
diff --git a/docs/make.jl b/docs/make.jl
index a54f0f6..47bd6f5 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,23 +1,24 @@
using DocsScraper
using Documenter
-DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)
+DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive = true)
makedocs(;
- modules=[DocsScraper],
- authors="Shreyas Agrawal @splendidbug and J S @svilupp",
- sitename="DocsScraper.jl",
- # format=Documenter.HTML(;
- # canonical="https://Shreyas Agrawal.github.io/DocsScraper.jl",
- # edit_link="master",
- # assets=String[],
- # ),
- pages=[
- "Home" => "index.md",
- ],
+ modules = [DocsScraper],
+ authors = "Shreyas Agrawal @splendidbug and contributors",
+ sitename = "DocsScraper.jl",
+ repo = "https://github.com/splendidbug/DocsScraper.jl/blob/{commit}{path}#{line}",
+ format = Documenter.HTML(;
+ repolink = "https://github.com/splendidbug/DocsScraper.jl",
+ canonical = "https://splendidbug.github.io/DocsScraper.jl",
+ edit_link = "main",
+ assets = String[]),
+ pages = [
+ "API Index" => "index.md"
+ ]
)
deploydocs(;
- repo="github.com/Shreyas Agrawal/DocsScraper.jl",
- devbranch="main",
+ repo = "github.com/splendidbug/DocsScraper.jl",
+ devbranch = "main"
)
diff --git a/docs/src/index.md b/docs/src/index.md
index a6f0129..c30e1af 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,4 +1,8 @@
-# DocsScraper
+# Reference
-## Documentation
+```@index
+```
+```@autodocs
+Modules = [DocsScraper]
+```
\ No newline at end of file
diff --git a/src/DocsScraper.jl b/src/DocsScraper.jl
index 40bc3ee..7f114d9 100644
--- a/src/DocsScraper.jl
+++ b/src/DocsScraper.jl
@@ -9,20 +9,23 @@ using LinearAlgebra, Unicode, SparseArrays
using HDF5
using Tar
using Inflate
-
using SHA
using Serialization, URIs
+using Dates
+using JSON
include("parser.jl")
include("crawl.jl")
include("extract_urls.jl")
include("preparation.jl")
+include("extract_package_name.jl")
+export get_package_name
include("make_knowledge_packs.jl")
-export make_knowledge_packs, just_generate
+export make_knowledge_packs
include("user_preferences.jl")
include("utils.jl")
-export remove_urls_from_index
+export remove_urls_from_index, urls_for_metadata
-end
\ No newline at end of file
+end
diff --git a/src/crawl.jl b/src/crawl.jl
index a8f93c9..c972ef2 100644
--- a/src/crawl.jl
+++ b/src/crawl.jl
@@ -5,7 +5,7 @@
Parse the robots.txt string and return rules and the URLs on Sitemap
"""
function parse_robots_txt!(robots_txt::String)
- rules = Dict{String,Dict{String,Vector{String}}}()
+ rules = Dict{String, Dict{String, Vector{String}}}()
current_user_agent = ""
sitemap_urls = Vector{AbstractString}()
@@ -14,7 +14,8 @@ function parse_robots_txt!(robots_txt::String)
if startswith(line, "User-agent:")
current_user_agent = strip(split(line, ":")[2])
if !haskey(rules, current_user_agent)
- rules[current_user_agent] = Dict("Disallow" => Vector{String}(), "Allow" => Vector{String}())
+ rules[current_user_agent] = Dict(
+ "Disallow" => Vector{String}(), "Allow" => Vector{String}())
end
elseif startswith(line, "Disallow:")
disallow_path = strip(split(line, ":")[2])
@@ -30,12 +31,10 @@ function parse_robots_txt!(robots_txt::String)
url = strip(split(line, ":")[2])
push!(sitemap_urls, url)
end
-
end
return rules, sitemap_urls
end
-
"""
check_robots_txt(user_agent::AbstractString, url::AbstractString)
@@ -99,14 +98,12 @@ end
Extract the base url
"""
function get_base_url(url::AbstractString)
-
parsed_url = URIs.URI(url)
base_url = string(parsed_url.scheme, "://", parsed_url.host,
parsed_url.port != nothing ? "" * string(parsed_url.port) : "", parsed_url.path)
return base_url
end
-
"""
process_hostname(url::AbstractString)
@@ -118,7 +115,6 @@ function process_hostname(url::AbstractString)
return hostname
end
-
"""
process_hostname(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}})
@@ -128,7 +124,8 @@ Add `url` to its hostname in `hostname_dict`
- `url`: URL string
- `hostname_dict`: Dict with key being hostname and value being a vector of URLs
"""
-function process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}})
+function process_hostname!(
+ url::AbstractString, hostname_dict::Dict{AbstractString, Vector{AbstractString}})
hostname = process_hostname(url)
# Add the URL to the dictionary under its hostname
@@ -139,17 +136,15 @@ function process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractStri
end
end
-
"""
crawl(input_urls::Vector{<:AbstractString})
Crawl on the input URLs and return a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs
"""
function crawl(input_urls::Vector{<:AbstractString})
-
url_queue = Vector{AbstractString}(input_urls)
visited_url_set = Set{AbstractString}()
- hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}()
+ hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}()
sitemap_urls = Vector{AbstractString}()
# TODO: Add parallel processing for URLs
@@ -174,5 +169,4 @@ function crawl(input_urls::Vector{<:AbstractString})
end
return hostname_url_dict, visited_url_set
-
end
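For orientation, a minimal usage sketch of the reformatted `crawl` entry point (the URL is illustrative; the call hits the live site and honours robots.txt):

```julia
using DocsScraper: crawl

input_urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
hostname_url_dict, visited = crawl(input_urls)
# hostname_url_dict maps hostnames, e.g. "docs.julialang.org", to the page URLs discovered under them
```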
diff --git a/src/extract_package_name.jl b/src/extract_package_name.jl
new file mode 100644
index 0000000..525cecf
--- /dev/null
+++ b/src/extract_package_name.jl
@@ -0,0 +1,162 @@
+"""
+ clean_url(url::String)
+
+Strip the URL of any http://, https://, or www. prefixes
+"""
+function clean_url(url::String)
+ # Remove http://, https://, www., or wwws.
+ cleaned_url = replace(url, r"^https?://(www\d?\.)?" => "")
+ return cleaned_url
+end
+
+"""
+ base_url_segment(url::String)
+
+Return the base URL and the first path segment; used as the fallback when all other checks fail
+"""
+function base_url_segment(url::String)
+ # Clean the URL from unwanted prefixes
+ cleaned_url = clean_url(url)
+
+ # Parse the cleaned URL
+ uri = URI("https://" * cleaned_url) # Add https:// to ensure correct parsing
+
+ # Extract the base URL (host)
+ base_url = replace(uri.host, r"^www\." => "")
+
+ # Extract the first path segment
+ path_segments = split(uri.path, "/"; keepempty = false)
+
+ if !isempty(path_segments)
+ first_segment = path_segments[1]
+ return "$base_url/$first_segment"
+ else
+ return base_url
+ end
+end
+
+"""
+ url_package_name(url::AbstractString)
+
+Return the package name if the URL itself contains it with a ".jl" or "_jl" suffix
+"""
+function url_package_name(url::AbstractString)
+ if occursin(r"\.jl", url) || occursin(r"_jl", url)
+ package_name = match(r"[\/]([^\/]+(?:\.jl|_jl))", url)
+ return package_name.captures[1]
+ end
+ return ""
+end
+
+"""
+ get_base_url(url::AbstractString)
+
+Extract the base url
+"""
+function get_base_url(url::AbstractString)
+ parsed_url = URIs.URI(url)
+ base_url = string(parsed_url.scheme, "://", parsed_url.host,
+ parsed_url.port != nothing ? ":" * string(parsed_url.port) : "", parsed_url.path)
+ return base_url
+end
+
+"""
+ nav_bar(url::AbstractString)
+
+Julia documentation websites tend to expose the package name under the ".docs-package-name" class in the HTML tree
+"""
+function nav_bar(url::AbstractString)
+ base_url = get_base_url(url)
+ fetched_content = HTTP.get(base_url)
+ parsed = Gumbo.parsehtml(String(fetched_content.body))
+ content_candidates = [el
+ for el in AbstractTrees.PreOrderDFS(parsed.root)
+ if el isa HTMLElement]
+ content_by_class = filter(
+ el -> getattr(el, "class", nothing) in ["docs-package-name"], content_candidates)
+ if (!isempty(content_by_class))
+ parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)])
+ heading_hierarchy = Dict{Symbol, Any}()
+ process_node!(only(content_by_class), heading_hierarchy, parsed_blocks)
+ package_name = parsed_blocks[2]["text"]
+ return package_name
+ end
+ return ""
+end
+
+"""
+ text_before_version(url::AbstractString)
+
+Return the text before "stable", "dev", or a version number in the URL. Documentation websites generally place the package name right before the version
+"""
+function text_before_version(url::AbstractString)
+ language_prefixes = [
+ "/en/", "/es/", "/fr/", "/de/", "/it/", "/pt/", "/ru/", "/zh/", "/ja/", "/ko/"]
+ contains_prefix = any(occursin(prefix, url) for prefix in language_prefixes)
+ if contains_prefix
+ pattern = r"/([^/]+)/([^/]+)/(?:stable|dev|latest|v\d+(\.\d+)*)(?:/|$)"
+ else
+ pattern = r"/([^/]+)/(?:stable|dev|latest|v\d+(\.\d+)*)"
+ end
+ package_name = match(pattern, url)
+ if package_name !== nothing
+ return package_name.captures[1]
+ end
+ return ""
+end
+
+"""
+ docs_in_url(url::AbstractString)
+
+If the base URL has the form docs.package_name.domain_extension, return the middle segment, i.e. the package name
+"""
+function docs_in_url(url::AbstractString)
+ cleaned_url = clean_url(url)
+
+ # Parse the cleaned URL
+ uri = URI("https://" * cleaned_url) # Add https:// to ensure correct parsing
+
+ # Extract the base URL (host)
+ base_url = replace(uri.host, r"^www\." => "")
+ pattern = r"docs\.([^.]+)\.(org|com|ai|net|io|co|tech)"
+ m = match(pattern, base_url)
+ if m !== nothing
+ return m.captures[1]
+ end
+ return ""
+end
+
+"""
+ get_package_name(url::AbstractString)
+
+Return the package name derived from the package URL
+"""
+function get_package_name(url::AbstractString)
+
+ # try 1: look for package name in URL
+ package_name = url_package_name(url)
+ if (!isempty(package_name))
+ return package_name
+ end
+
+ # try 2: look for package name in nav bar
+ package_name = nav_bar(url)
+ if (!isempty(package_name))
+ return package_name
+ end
+
+ # try 3: if the base url is in the form docs.package_name.domain_extension
+ package_name = docs_in_url(url)
+ if (!isempty(package_name))
+ return package_name
+ end
+
+ # try 4: get text before "stable" or "dev" or any version in URL
+ package_name = text_before_version(url)
+ if (!isempty(package_name))
+ return package_name
+ end
+
+ # fallback: return base URL with first path segment
+ return base_url_segment(url)
+end
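A hedged sketch of the new package-name heuristics (the result of the second call depends on the live page, since the nav-bar and later fallbacks fetch the site's HTML):

```julia
using DocsScraper

# try 1: the URL itself carries a ".jl" name
get_package_name("https://github.com/JuliaData/DataFrames.jl")   # "DataFrames.jl"

# later fallbacks: the nav bar, docs.<name>.<tld> hosts, or the text before the version segment
get_package_name("https://docs.makie.org/stable/")               # likely "makie"
```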
diff --git a/src/extract_urls.jl b/src/extract_urls.jl
index d5e8fcf..d750f34 100644
--- a/src/extract_urls.jl
+++ b/src/extract_urls.jl
@@ -141,4 +141,4 @@ function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString})
else
find_urls_html!(url, parsed.root, url_queue)
end
-end
\ No newline at end of file
+end
diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl
index 291a9c7..5d56ff8 100644
--- a/src/make_knowledge_packs.jl
+++ b/src/make_knowledge_packs.jl
@@ -24,8 +24,8 @@ function create_output_folders(knowledge_pack_path::String)
end
"""
- make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; max_chunk_size::Int=MAX_CHUNK_SIZE,
- min_chunk_size::Int=MIN_CHUNK_SIZE)
+ make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String;
+ max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
Parse URLs from hostname_url_dict and save the chunks
@@ -44,7 +44,8 @@ function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractStri
output_sources = Vector{String}()
for url in urls
try
- chunks, sources = process_paths(url; max_chunk_size, min_chunk_size)
+ chunks, sources = process_paths(
+ url; max_chunk_size, min_chunk_size)
append!(output_chunks, chunks)
append!(output_sources, sources)
catch
@@ -85,16 +86,20 @@ function l2_norm_columns(vect::AbstractVector)
end
"""
- generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE)
+ generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL,
+ embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString)
Deserialize chunks and sources to generate embeddings
# Arguments
- model: Embedding model
- embedding_size: Embedding dimensions
+- custom_metadata: Custom metadata like ecosystem name if required
"""
-function generate_embeddings(knowledge_pack_path::String; model::AbstractString = MODEL,
- embedding_size::Int = EMBEDDING_SIZE)
+function generate_embeddings(
+ knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
+ model::AbstractString = MODEL,
+ embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString)
embedder = RT.BatchEmbedder()
entries = readdir(knowledge_pack_path)
# Initialize a dictionary to group files by hostname and chunk size
@@ -114,31 +119,31 @@ function generate_embeddings(knowledge_pack_path::String; model::AbstractString
if match_chunks !== nothing
hostname = match_chunks.captures[1]
- chunk_size = parse(Int, match_chunks.captures[2])
+ max_chunk_size = parse(Int, match_chunks.captures[2])
if !haskey(hostname_files, hostname)
hostname_files[hostname] = Dict{Int, Dict{String, String}}()
end
- if !haskey(hostname_files[hostname], chunk_size)
- hostname_files[hostname][chunk_size] = Dict{String, String}()
+ if !haskey(hostname_files[hostname], max_chunk_size)
+ hostname_files[hostname][max_chunk_size] = Dict{String, String}()
end
- hostname_files[hostname][chunk_size]["chunks"] = joinpath(
+ hostname_files[hostname][max_chunk_size]["chunks"] = joinpath(
knowledge_pack_path, file)
elseif match_sources !== nothing
hostname = match_sources.captures[1]
- chunk_size = parse(Int, match_sources.captures[2])
+ max_chunk_size = parse(Int, match_sources.captures[2])
if !haskey(hostname_files, hostname)
hostname_files[hostname] = Dict{Int, Dict{String, String}}()
end
- if !haskey(hostname_files[hostname], chunk_size)
- hostname_files[hostname][chunk_size] = Dict{String, String}()
+ if !haskey(hostname_files[hostname], max_chunk_size)
+ hostname_files[hostname][max_chunk_size] = Dict{String, String}()
end
- hostname_files[hostname][chunk_size]["sources"] = joinpath(
+ hostname_files[hostname][max_chunk_size]["sources"] = joinpath(
knowledge_pack_path, file)
end
end
# Process each pair of files
for (hostname, chunk_files) in hostname_files
- for (chunk_size, files) in chunk_files
+ for (max_chunk_size, files) in chunk_files
if haskey(files, "chunks") && haskey(files, "sources")
chunks_file = files["chunks"]
sources_file = files["sources"]
@@ -148,17 +153,31 @@ function generate_embeddings(knowledge_pack_path::String; model::AbstractString
full_embeddings = RT.get_embeddings(
embedder, chunks; model, verbose = false, cost_tracker)
@info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))"
+
+ trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
fn_output = joinpath(knowledge_pack_path, "packs",
- "$hostname-textembedding3large-0-Float32__v1.0.tar.gz")
+ "$hostname-$model-$trunc-Float32__v1.0.tar.gz")
fn_temp = joinpath(knowledge_pack_path, "packs",
- "$hostname-textembedding3large-0-Float32__v1.0.hdf5")
+ "$hostname-$model-$trunc-Float32__v1.0.hdf5")
+
h5open(fn_temp, "w") do file
file["chunks"] = chunks
file["sources"] = sources
file["embeddings"] = full_embeddings[1:embedding_size, :] |>
l2_norm_columns |> x -> map(>(0), x)
file["type"] = "ChunkIndex"
- # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl"
+
+ package_url_dict = Dict{String, Vector{String}}()
+ package_url_dict = urls_for_metadata(sources)
+
+ metadata = Dict(
+ :embedded_dt => Dates.today(),
+ :custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size,
+ :embedding_size => embedding_size, :model => model,
+ :packages => package_url_dict)
+
+ metadata_json = JSON.json(metadata)
+ file["metadata"] = metadata_json
end
command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
@@ -166,7 +185,7 @@ function generate_embeddings(knowledge_pack_path::String; model::AbstractString
report_artifact(fn_output)
else
- @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size"
+ @warn "Missing pair for hostname: $hostname, max chunk size: $max_chunk_size"
end
end
end
@@ -174,7 +193,8 @@ end
"""
make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],
- max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE)
+ max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE,
+ custom_metadata::AbstractString)
Entry point to crawl, parse and generate embeddings
@@ -185,11 +205,12 @@ Entry point to crawl, parse and generate embeddings
- min_chunk_size: Minimum chunk size
- model: Embedding model
- embedding_size: Embedding dimensions
+- custom_metadata: Custom metadata like ecosystem name if required
"""
function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[];
single_urls::Vector{<:AbstractString} = String[],
max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE,
- model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE)
+ model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "")
if isempty(crawlable_urls) && isempty(single_urls)
error("At least one of `input_urls` or `single_pages` must be provided.")
end
@@ -217,6 +238,8 @@ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[
end
knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs")
create_output_folders(knowledge_pack_path)
- make_chunks(hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size)
- generate_embeddings(knowledge_pack_path; model, embedding_size)
+ make_chunks(
+ hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size)
+ generate_embeddings(
+ knowledge_pack_path; max_chunk_size, model, embedding_size, custom_metadata)
end
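With the new keyword wired through, a build might look like the sketch below (URL and metadata string are illustrative; the embedding step calls the configured model through PromptingTools.jl, so the corresponding API key must be set):

```julia
using DocsScraper

make_knowledge_packs(["https://docs.sciml.ai/Overview/stable/"];
    custom_metadata = "SciML ecosystem",
    max_chunk_size = 384,
    embedding_size = 3072)
```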
diff --git a/src/parser.jl b/src/parser.jl
index def1a17..2de7035 100644
--- a/src/parser.jl
+++ b/src/parser.jl
@@ -340,9 +340,6 @@ end
parse_url(url::AbstractString)
Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata
-
-# Arguments
-- `url`: URL string to parse
"""
function parse_url_to_blocks(url::AbstractString)
@@ -356,7 +353,7 @@ function parse_url_to_blocks(url::AbstractString)
# Getting title of the document
# title = [el
# for el in AbstractTrees.PreOrderDFS(r_parsed.root)
- # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")
+ # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")
parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)])
heading_hierarchy = Dict{Symbol, Any}()
process_node!(get_html_content(parsed.root), heading_hierarchy, parsed_blocks)
diff --git a/src/preparation.jl b/src/preparation.jl
index 9979155..8736050 100644
--- a/src/preparation.jl
+++ b/src/preparation.jl
@@ -10,22 +10,21 @@ get_header_path(d)
# Output: "Axis/Attributes/yzoomkey"
```
"""
-function get_header_path(d::Dict{String,Any})
- metadata = get(d, "metadata", Dict{Any,Any}())
+function get_header_path(d::Dict{String, Any})
+ metadata = get(d, "metadata", Dict{Any, Any}())
isempty(metadata) && return nothing
keys_ = [:h1, :h2, :h3]
vals = get.(Ref(metadata), keys_, "") |> x -> filter(!isempty, x) |> x -> join(x, "/")
isempty(vals) ? nothing : vals
end
-
-
"""
roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="")
Roll-up chunks (that have the same header!), so we can split them later by to get the desired length
"""
-function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="")
+function roll_up_chunks(parsed_blocks::Vector{Dict{String, Any}},
+ url::AbstractString; separator::String = "")
docs = String[]
io = IOBuffer()
last_header = nothing
@@ -57,7 +56,6 @@ function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractSt
return docs, sources
end
-
struct DocParserChunker <: RT.AbstractChunker end
"""
@@ -74,9 +72,9 @@ and splits them by separators to get the desired length.
- separators: Chunk separators
- max_chunk_size Maximum chunk size
"""
-function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
- verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE)
-
+function RT.get_chunks(
+ chunker::DocParserChunker, url::AbstractString;
+ verbose::Bool = true, separators = ["\n\n", ". ", "\n", " "], max_chunk_size::Int = MAX_CHUNK_SIZE)
SEP = ""
sources = AbstractVector{<:AbstractString}
output_chunks = Vector{SubString{String}}()
@@ -86,14 +84,14 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
parsed_blocks = parse_url_to_blocks(url)
## Roll up to the same header
- docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP)
+ docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
## roll up chunks by SEP splitter, then remove it later
for (doc, src) in zip(docs_, sources_)
## roll up chunks by SEP splitter, then remove it later
- doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length=max_chunk_size) .|>
+ doc_chunks = PT.recursive_splitter(
+ doc, [SEP, separators...]; max_length = max_chunk_size) .|>
x -> replace(x, SEP => " ") .|> strip |> x -> filter(!isempty, x)
- chunk_lengths = length.(doc_chunks)
# skip if no chunks found
isempty(doc_chunks) && continue
append!(output_chunks, doc_chunks)
@@ -102,15 +100,14 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
return output_chunks, output_sources
end
-
-
"""
process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them.
"""
-function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
-
+function process_paths(url::AbstractString;
+ max_chunk_size::Int = MAX_CHUNK_SIZE,
+ min_chunk_size::Int = MIN_CHUNK_SIZE)
output_chunks = Vector{SubString{String}}()
output_sources = Vector{String}()
@@ -119,9 +116,9 @@ function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE,
append!(output_chunks, chunks)
append!(output_sources, sources)
-
@info "Scraping done: $(length(output_chunks)) chunks"
- output_chunks, output_sources = postprocess_chunks(output_chunks, output_sources; min_chunk_size, skip_code=true)
+ output_chunks, output_sources = postprocess_chunks(
+ output_chunks, output_sources; min_chunk_size, skip_code = true)
return output_chunks, output_sources
end
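The chunker can also be driven directly through the RAGTools interface; a sketch, assuming the non-exported `DocParserChunker` is accessed by name:

```julia
using DocsScraper: DocParserChunker
using PromptingTools.Experimental.RAGTools: get_chunks

chunks, sources = get_chunks(DocParserChunker(), "https://docs.julialang.org/en/v1/";
    max_chunk_size = 384)
```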
diff --git a/src/user_preferences.jl b/src/user_preferences.jl
index 98794c6..00c1a2f 100644
--- a/src/user_preferences.jl
+++ b/src/user_preferences.jl
@@ -1,4 +1,4 @@
global MIN_CHUNK_SIZE = 40
-global MAX_CHUNK_SIZE = 256
+global MAX_CHUNK_SIZE = 384
global MODEL = "text-embedding-3-large"
-global EMBEDDING_SIZE = 1024
\ No newline at end of file
+global EMBEDDING_SIZE = 3072
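These globals only set defaults; individual runs can still override them, e.g. to reproduce the previous, smaller settings:

```julia
make_knowledge_packs(["https://docs.julialang.org/en/v1/"];
    max_chunk_size = 256, embedding_size = 1024)
```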
diff --git a/src/utils.jl b/src/utils.jl
index e8dc014..dfbc17c 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -30,21 +30,21 @@ end
Remove chunks that are duplicated in the input list of chunks and their corresponding sources.
"""
-function remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
+function remove_duplicates(
+ chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
idxs = find_duplicates(chunks)
return chunks[.!idxs], sources[.!idxs]
end
-
"""
remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
Remove chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources.
"""
-function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
- min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
-
+function remove_short_chunks(
+ chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int = MIN_CHUNK_SIZE, skip_code::Bool = true)
chunk_lengths = length.(chunks)
idx = if skip_code
## Keep short chunks if they contain code (might be combined with some preceding/succeeding text)
@@ -56,17 +56,15 @@ function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::
return chunks[idx], sources[idx]
end
-
-function replace_local_paths(sources::AbstractVector{<:AbstractString}, paths::AbstractVector{<:AbstractString}, websites::AbstractVector{<:AbstractString})
- @assert length(paths) == length(websites) "Length of `paths` must match length of `websites`"
+function replace_local_paths(
+ sources::AbstractVector{<:AbstractString}, paths::AbstractVector{<:AbstractString},
+ websites::AbstractVector{<:AbstractString})
+ @assert length(paths)==length(websites) "Length of `paths` must match length of `websites`"
replacement_pairs = paths .=> websites
output = map(x -> replace(x, replacement_pairs...), sources)
return output
end
-
-
-
"""
function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
@@ -74,9 +72,11 @@ end
Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates.
"""
-function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
- min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
- websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
+function postprocess_chunks(
+ chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int = MIN_CHUNK_SIZE, skip_code::Bool = true,
+ paths::Union{Nothing, AbstractVector{<:AbstractString}} = nothing,
+ websites::Union{Nothing, AbstractVector{<:AbstractString}} = nothing)
len_ = length(chunks)
chunks, sources = remove_short_chunks(chunks, sources; min_chunk_size, skip_code)
@info "Removed $(len_ - length(chunks)) short chunks"
@@ -99,7 +99,8 @@ end
Remove chunks and sources corresponding to URLs starting with `prefix_urls`
"""
-function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString})
+function remove_urls_from_index(
+ index_path::AbstractString, prefix_urls = Vector{<:AbstractString})
@assert endswith(file_path, ".hdf5") "Provided file path must end with `.hdf5` (see HDF5.jl)."
h5open(index_path, "r+") do orig_file
@@ -119,4 +120,43 @@ function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<
write(file["chunks"], chunks)
write(file["embeddings"], embeddings)
end
-end
\ No newline at end of file
+end
+
+"""
+ urls_for_metadata(sources::Vector{String})
+
+Return a Dict mapping package names to their associated URLs
+Note: Because there can be many URLs, they are stripped down to the package root; package subpaths are not included in the metadata.
+"""
+function urls_for_metadata(sources::Vector{String})
+ urls = [split(source, " -")[1] for source in sources]
+ pattern = r"(/(?:stable|dev|latest|v\d+(?:\.\d+)*))"
+ cleaned_urls = [endswith(String(url), "/") ? String(url)[1:(end - 1)] : String(url)
+ for url in urls]
+ unique_urls = unique(cleaned_urls)
+ package_names = Vector{String}()
+
+ for url in unique_urls
+ push!(package_names, get_package_name(String(url)))
+ end
+
+ cleaned_urls = [match(pattern, url) !== nothing ? first(split(url, pattern)) : url
+ for url in unique_urls]
+
+ zipped = zip(cleaned_urls, package_names) |> collect
+ unique_pairs = unique(zipped)
+ unique_urls = [pair[1] for pair in unique_pairs]
+ unique_package_names = [pair[2] for pair in unique_pairs]
+
+ package_url_dict = Dict{String, Vector{String}}()
+ for (url, package_name) in zip(unique_urls, unique_package_names)
+ if haskey(package_url_dict, package_name)
+ # If the package_name is already a key, append the url to the existing array
+ push!(package_url_dict[package_name], url)
+ else
+ # Otherwise, create a new entry with the package_name and the url
+ package_url_dict[package_name] = [url]
+ end
+ end
+ return package_url_dict
+end
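A sketch of what `urls_for_metadata` returns. The source strings are assumed to follow the `"<url> - <heading path>"` shape the scraper emits, and the package name comes from `get_package_name`, so the exact key depends on the live page:

```julia
using DocsScraper

sources = [
    "https://docs.makie.org/stable/reference/blocks/axis - Axis/Attributes",
    "https://docs.makie.org/stable/tutorials/basic-tutorial - Tutorials",
]
urls_for_metadata(sources)
# e.g. Dict("makie" => ["https://docs.makie.org"]); version segments and subpaths are dropped
```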
diff --git a/test/crawl.jl b/test/crawl.jl
new file mode 100644
index 0000000..6b00ca4
--- /dev/null
+++ b/test/crawl.jl
@@ -0,0 +1,7 @@
+using DocsScraper: crawl
+
+@testset "crawl" begin
+ urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
+ hostname_url_dict = crawl(urls)
+ @test length(hostname_url_dict) > 0
+end
diff --git a/test/make_knowledge_packs.jl b/test/make_knowledge_packs.jl
new file mode 100644
index 0000000..5690725
--- /dev/null
+++ b/test/make_knowledge_packs.jl
@@ -0,0 +1,8 @@
+using DocsScraper: process_paths
+
+@testset "overall test" begin
+ url = "https://docs.julialang.org/en/v1/"
+ chunks, sources = process_paths(url)
+ @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing &&
+ sources[1] != nothing
+end
diff --git a/test/parser.jl b/test/parser.jl
new file mode 100644
index 0000000..0faeb04
--- /dev/null
+++ b/test/parser.jl
@@ -0,0 +1,11 @@
+using DocsScraper: parse_url_to_blocks, roll_up_chunks
+
+@testset "parse & roll_up" begin
+ url = "https://docs.julialang.org/en/v1/"
+ parsed_blocks = parse_url_to_blocks(url)
+ @test length(parsed_blocks) > 0
+ SEP = ""
+ docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
+ @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
+ sources_[1] != nothing
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 4b4a92c..6e1e7e8 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,51 +1,13 @@
-
+using DocsScraper
using Test
-using HTTP, Gumbo, AbstractTrees, URIs
-using Gumbo: HTMLDocument, HTMLElement
-using EzXML
-using PromptingTools
-const PT = PromptingTools
-const RT = PromptingTools.Experimental.RAGTools
-using LinearAlgebra, Unicode, SparseArrays
-using HDF5
-using Tar
-using Inflate
-using SHA
-using Serialization, URIs
-
-include(joinpath("..", "src", "crawl.jl"))
-include(joinpath("..", "src", "extract_urls.jl"))
-include(joinpath("..", "src", "parser.jl"))
-include(joinpath("..", "src", "preparation.jl"))
-include(joinpath("..", "src", "user_preferences.jl"))
-include(joinpath("..", "src", "utils.jl"))
-
-urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
-url = urls[1]
-queue = Vector{AbstractString}()
-
-@testset "HTTP" begin
- @test HTTP.get(url) != nothing
- result, sitemap_queue = check_robots_txt("*", url)
- @test result == true
-end
-
-@testset "crawl" begin
- hostname_url_dict = crawl(urls)
- @test length(hostname_url_dict) > 0
-end
+using Aqua
-@testset "parse & roll_up" begin
- parsed_blocks = parse_url_to_blocks(url)
- @test length(parsed_blocks) > 0
- SEP = ""
- docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
- @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
- sources_[1] != nothing
-end
+@testset "DocsScraper.jl" begin
+ @testset "Code quality (Aqua.jl)" begin
+ Aqua.test_all(DocsScraper; persistent_tasks = false)
+ end
-@testset "overall test" begin
- chunks, sources = process_paths(url)
- @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing &&
- sources[1] != nothing
+ include("crawl.jl")
+ include("parser.jl")
+ include("make_knowledge_packs.jl")
end
diff --git a/test/utils.jl b/test/utils.jl
new file mode 100644
index 0000000..fbe338a
--- /dev/null
+++ b/test/utils.jl
@@ -0,0 +1,10 @@
+using DocsScraper: parse_url_to_blocks, roll_up_chunks
+
+@testset "parse & roll_up" begin
+ parsed_blocks = parse_url_to_blocks(url)
+ @test length(parsed_blocks) > 0
+ SEP = ""
+ docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
+ @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
+ sources_[1] != nothing
+end
From 965873abf5b48f7bac6e99036d6b23a79dd54985 Mon Sep 17 00:00:00 2001
From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Thu, 15 Aug 2024 19:33:35 -0700
Subject: [PATCH 6/7] create a single index file
---
Project.toml | 6 +-
src/DocsScraper.jl | 1 +
src/make_knowledge_packs.jl | 140 +++++++++++++++++++++---------------
3 files changed, 87 insertions(+), 60 deletions(-)
diff --git a/Project.toml b/Project.toml
index 1fb77c2..ef5aaa9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,7 @@ Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
@@ -30,8 +31,10 @@ Gumbo = "0.8"
HDF5 = "0.17"
HTTP = "1.10"
Inflate = "0.1"
+JSON = "0.21"
LinearAlgebra = "1"
-PromptingTools = "0.48"
+PromptingTools = "0.49"
+Random = "1"
SHA = "0.7"
Serialization = "1"
SparseArrays = "1"
@@ -40,7 +43,6 @@ Test = "1"
URIs = "1.5"
Unicode = "1"
julia = "1.10"
-JSON = "0.21"
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/src/DocsScraper.jl b/src/DocsScraper.jl
index 7f114d9..0a65d57 100644
--- a/src/DocsScraper.jl
+++ b/src/DocsScraper.jl
@@ -13,6 +13,7 @@ using SHA
using Serialization, URIs
using Dates
using JSON
+using Random
include("parser.jl")
include("crawl.jl")
diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl
index 5d56ff8..a787edf 100644
--- a/src/make_knowledge_packs.jl
+++ b/src/make_knowledge_packs.jl
@@ -38,7 +38,6 @@ Parse URLs from hostname_url_dict and save the chunks
function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}},
knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
min_chunk_size::Int = MIN_CHUNK_SIZE)
- SAVE_CHUNKS = true
for (hostname, urls) in hostname_url_dict
output_chunks = Vector{SubString{String}}()
output_sources = Vector{String}()
@@ -52,16 +51,14 @@ function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractStri
@error "error!! check url: $url"
end
end
- if SAVE_CHUNKS
- serialize(
- joinpath(knowledge_pack_path,
- "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
- output_chunks)
- serialize(
- joinpath(knowledge_pack_path,
- "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
- output_sources)
- end
+ serialize(
+ joinpath(knowledge_pack_path,
+ "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
+ output_chunks)
+ serialize(
+ joinpath(knowledge_pack_path,
+ "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
+ output_sources)
end
end
@@ -87,19 +84,24 @@ end
"""
generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL,
- embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString)
+ embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString,
+ bool_embeddings::Bool = true, index_name::AbstractString = "")
Deserialize chunks and sources to generate embeddings
+Note: We highly recommend passing `index_name`; it becomes the name of the generated index. Default: date-randomInt
# Arguments
- model: Embedding model
- embedding_size: Embedding dimensions
- custom_metadata: Custom metadata like ecosystem name if required
+- bool_embeddings: If true, the generated embeddings will be boolean; Float32 otherwise
+- index_name: Name of the index. Default: date-randomInt
"""
function generate_embeddings(
knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
model::AbstractString = MODEL,
- embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString)
+ embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString,
+ bool_embeddings::Bool = true, index_name::AbstractString = "")
embedder = RT.BatchEmbedder()
entries = readdir(knowledge_pack_path)
# Initialize a dictionary to group files by hostname and chunk size
@@ -109,9 +111,6 @@ function generate_embeddings(
chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$"
sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$"
- # chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$"
- # sources_pattern = r"^(.*)-sources-(\d+)\.jls$"
-
# Group files by hostname and chunk size
for file in entries
match_chunks = match(chunks_pattern, file)
@@ -141,62 +140,83 @@ function generate_embeddings(
knowledge_pack_path, file)
end
end
- # Process each pair of files
+
+ chunks = Vector{SubString{String}}()
+ sources = Vector{String}()
+
+ # Add chunks and sources to vectors from each of the scraped file
for (hostname, chunk_files) in hostname_files
for (max_chunk_size, files) in chunk_files
if haskey(files, "chunks") && haskey(files, "sources")
chunks_file = files["chunks"]
sources_file = files["sources"]
- chunks = deserialize(chunks_file)
- sources = deserialize(sources_file)
- cost_tracker = Threads.Atomic{Float64}(0.0)
- full_embeddings = RT.get_embeddings(
- embedder, chunks; model, verbose = false, cost_tracker)
- @info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))"
-
- trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
- fn_output = joinpath(knowledge_pack_path, "packs",
- "$hostname-$model-$trunc-Float32__v1.0.tar.gz")
- fn_temp = joinpath(knowledge_pack_path, "packs",
- "$hostname-$model-$trunc-Float32__v1.0.hdf5")
-
- h5open(fn_temp, "w") do file
- file["chunks"] = chunks
- file["sources"] = sources
- file["embeddings"] = full_embeddings[1:embedding_size, :] |>
- l2_norm_columns |> x -> map(>(0), x)
- file["type"] = "ChunkIndex"
-
- package_url_dict = Dict{String, Vector{String}}()
- package_url_dict = urls_for_metadata(sources)
-
- metadata = Dict(
- :embedded_dt => Dates.today(),
- :custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size,
- :embedding_size => embedding_size, :model => model,
- :packages => package_url_dict)
-
- metadata_json = JSON.json(metadata)
- file["metadata"] = metadata_json
- end
-
- command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
- run(command)
- report_artifact(fn_output)
-
+ append!(chunks, deserialize(chunks_file))
+ append!(sources, deserialize(sources_file))
else
@warn "Missing pair for hostname: $hostname, max chunk size: $max_chunk_size"
end
end
end
+
+ # Generate embeddings
+ cost_tracker = Threads.Atomic{Float64}(0.0)
+ full_embeddings = RT.get_embeddings(
+ embedder, chunks; model, verbose = false, cost_tracker)
+
+ full_embeddings = full_embeddings[1:embedding_size, :] |>
+ l2_norm_columns
+
+ if bool_embeddings
+ full_embeddings = map(>(0), full_embeddings)
+ end
+
+ if isempty(index_name)
+ rand_int = rand(1000:100000)
+ date = Dates.today()
+ index_name = "$(date)-$(rand_int)"
+ end
+
+ @info "Created embeddings for $index_name. Cost: \$$(round(cost_tracker[], digits=3))"
+
+ trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
+ emb_data_type = bool_embeddings ? "Bool" : "Float32"
+
+ fn_output = joinpath(knowledge_pack_path, "packs",
+ "$index_name-$model-$trunc-$(emb_data_type)__v1.0.tar.gz")
+ fn_temp = joinpath(knowledge_pack_path, "packs",
+ "$index_name-$model-$trunc-$(emb_data_type)__v1.0.hdf5")
+
+ h5open(fn_temp, "w") do file
+ file["chunks"] = chunks
+ file["sources"] = sources
+ file["embeddings"] = full_embeddings
+ file["type"] = "ChunkIndex"
+
+ package_url_dict = Dict{String, Vector{String}}()
+ package_url_dict = urls_for_metadata(sources)
+
+ metadata = Dict(
+ :embedded_dt => Dates.today(),
+ :custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size,
+ :embedding_size => embedding_size, :model => model,
+ :packages => package_url_dict)
+
+ metadata_json = JSON.json(metadata)
+ file["metadata"] = metadata_json
+ end
+
+ command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
+ run(command)
+ report_artifact(fn_output)
end
"""
make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],
max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE,
- custom_metadata::AbstractString)
+ custom_metadata::AbstractString, bool_embeddings::Bool = true, index_name::AbstractString = "")
-Entry point to crawl, parse and generate embeddings
+Entry point to crawl, parse and generate embeddings.
+Note: We highly recommend passing `index_name`; it becomes the name of the generated index. Default: date-randomInt
# Arguments
- crawlable_urls: URLs that should be crawled to find more links
@@ -206,11 +226,14 @@ Entry point to crawl, parse and generate embeddings
- model: Embedding model
- embedding_size: Embedding dimensions
- custom_metadata: Custom metadata like ecosystem name if required
+- bool_embeddings: If true, the generated embeddings will be boolean; Float32 otherwise
+- index_name: Name of the index. Default: date-randomInt
"""
function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[];
single_urls::Vector{<:AbstractString} = String[],
max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE,
- model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "")
+ model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "",
+ bool_embeddings::Bool = true, index_name::AbstractString = "")
if isempty(crawlable_urls) && isempty(single_urls)
error("At least one of `input_urls` or `single_pages` must be provided.")
end
@@ -241,5 +264,6 @@ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[
make_chunks(
hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size)
generate_embeddings(
- knowledge_pack_path; max_chunk_size, model, embedding_size, custom_metadata)
+ knowledge_pack_path; max_chunk_size, model, embedding_size,
+ custom_metadata, bool_embeddings, index_name)
end
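Putting patch 6 together, a single combined index could be produced like this (names are illustrative):

```julia
using DocsScraper

make_knowledge_packs(["https://docs.sciml.ai/Overview/stable/"];
    index_name = "sciml",
    custom_metadata = "SciML ecosystem",
    bool_embeddings = true)   # Bool embeddings; set to false to keep Float32
```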
From b2c629ab64599da6be4e0f08b46d2e5567fbd8a6 Mon Sep 17 00:00:00 2001
From: Shreyas Shirish Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Thu, 15 Aug 2024 20:19:03 -0700
Subject: [PATCH 7/7] Update Project.toml
---
Project.toml | 1 -
1 file changed, 1 deletion(-)
diff --git a/Project.toml b/Project.toml
index 3cbe2e5..ef5aaa9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -43,7 +43,6 @@ Test = "1"
URIs = "1.5"
Unicode = "1"
julia = "1.10"
-JSON = "0.21"
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"