From ed41636c855fb24cedbc7b5a99b34a089369aecc Mon Sep 17 00:00:00 2001 From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com> Date: Wed, 10 Jul 2024 11:49:50 -0700 Subject: [PATCH 1/7] structured according to PkgTemplate --- .github/workflows/CI.yml | 81 ++++++++++++++++++++++++++++++++++++++++ MIT | 21 +++++++++++ Project.toml | 3 ++ docs/Project.toml | 20 ++++++++++ docs/make.jl | 24 ++++++++++++ docs/src/index.md | 4 ++ src/make_embeddings.jl | 18 +++++++-- test/runtests.jl | 30 +++++++++++---- 8 files changed, 190 insertions(+), 11 deletions(-) create mode 100644 .github/workflows/CI.yml create mode 100644 MIT create mode 100644 docs/Project.toml create mode 100644 docs/make.jl create mode 100644 docs/src/index.md diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000..371b418 --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,81 @@ +name: CI +on: + push: + branches: + - main + tags: ['*'] + pull_request: + workflow_dispatch: +concurrency: + # Skip intermediate builds: always. + # Cancel intermediate builds: only if it is a pull request build. 
+ group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + timeout-minutes: 60 + permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created + actions: write + contents: read + strategy: + fail-fast: false + matrix: + version: + <<#VERSIONS>> + - '<<&.>>' + <> + os: + - ubuntu-latest + arch: + - x64 + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v2 + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v4 + with: + files: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} + fail_ci_if_error: false + docs: + name: Documentation + runs-on: ubuntu-latest + permissions: + actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created + contents: write + statuses: write + pages: write + id-token: write + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 + with: + version: '1' + - uses: julia-actions/cache@v2 + - name: Configure doc environment + shell: julia --project=docs --color=yes {0} + run: | + using Pkg + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate() + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-docdeploy@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} + - name: Run doctests + shell: julia --project=docs --color=yes {0} + run: | + using Documenter: DocMeta, doctest + using <<&PKG>> + DocMeta.setdocmeta!(<<&PKG>>, :DocTestSetup, :(using <<&PKG>>); recursive=true) + doctest(<<&PKG>>) + <> diff --git a/MIT b/MIT new file mode 100644 
index 0000000..775ba1d --- /dev/null +++ b/MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) {{{YEAR}}} {{{AUTHORS}}} + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/Project.toml b/Project.toml index 964d069..0c1f6a8 100644 --- a/Project.toml +++ b/Project.toml @@ -11,7 +11,9 @@ Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" +Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIParser = "30578b45-9adc-5946-b283-645ec420af67" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" @@ -19,4 +21,5 @@ URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" AbstractTrees = "0.4.5" Gumbo = "0.8.2" HTTP = "1.10.4" +PromptingTools = "0.36.0" URIs = "1.5.1" diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..6fea155 --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,20 @@ +[deps] +AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1" +EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" +Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" +HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1" +PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" +Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +URIParser = "30578b45-9adc-5946-b283-645ec420af67" +URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" + +[compat] +AbstractTrees = "0.4.5" +Gumbo = "0.8.2" +HTTP = "1.10.4" +PromptingTools = "0.36.0" +URIs = "1.5.1" diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..38d4452 --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,24 @@ +using Documenter: Documenter, makedocs, deploydocs +using PkgTemplates: PkgTemplates + +makedocs(; + modules=[PkgTemplates], + authors="Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>", + repo="https://github.com/splendidbug/RAGKit", + 
sitename="RAGKit.jl", + # format=Documenter.HTML(; + # repolink="https://github.com/splendidbug/RAGKit", + # canonical="https://juliaci.github.io/PkgTemplates.jl", + # assets=String[], + # ), + pages=[ + "Home" => "index.md", + "User Guide" => "user.md", + "Developer Guide" => "developer.md", + "Migrating To PkgTemplates 0.7+" => "migrating.md", + ], +) + +deploydocs(; + repo="https://github.com/splendidbug/RAGKit", +) diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..f53a411 --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,4 @@ +# RAGKit + +## Documentation + diff --git a/src/make_embeddings.jl b/src/make_embeddings.jl index ba079aa..f51c865 100644 --- a/src/make_embeddings.jl +++ b/src/make_embeddings.jl @@ -66,6 +66,16 @@ function make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractStrin end +function l2_norm_columns(mat::AbstractMatrix) + norm_ = norm.(eachcol(mat)) + return mat ./ norm_' +end +function l2_norm_columns(vect::AbstractVector) + norm_ = norm(vect) + return vect / norm_ +end + + """ generate_embeddings() @@ -120,19 +130,19 @@ function generate_embeddings(knowledge_pack_path::String) chunks = deserialize(chunks_file) sources = deserialize(sources_file) cost_tracker = Threads.Atomic{Float64}(0.0) - full_embeddings = RT.get_embeddings(embedder, chunks; model="text-embedding-3-large", verbose=false, cost_tracker, api_key=ENV["OPENAI_API_KEY"]) + full_embeddings = RT.get_embeddings(embedder, chunks; model="text-embedding-3-large", verbose=false, cost_tracker, dimensions=1024) - # Float32 fn_output = joinpath(knowledge_pack_path, "packs", "$hostname-textembedding3large-0-Float32__v1.0.tar.gz") fn_temp = joinpath(knowledge_pack_path, "packs", "pack.hdf5") h5open(fn_temp, "w") do file file["chunks"] = chunks file["sources"] = sources - file["embeddings"] = full_embeddings + file["embeddings"] = full_embeddings[1:1024, :] |> l2_norm_columns |> x -> map(>(0), x) file["type"] = "ChunkIndex" # file["metadata"] 
= "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl" end - run(tar - cvzf$fn_output - C$(dirname(fn_temp))$(basename(fn_temp))) + command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` + run(command) report_artifact(fn_output) else diff --git a/test/runtests.jl b/test/runtests.jl index fdde81f..78a78b4 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,21 +1,37 @@ using Test +using HTTP, Gumbo, AbstractTrees, URIs +using Gumbo: HTMLDocument, HTMLElement +using EzXML +using PromptingTools +const PT = PromptingTools +const RT = PromptingTools.Experimental.RAGTools +using LinearAlgebra, Unicode, SparseArrays +using HDF5 +using Tar +using Inflate + +using SHA +using Serialization, URIs + +include("..\\src\\crawl.jl") +include("..\\src\\extract_urls.jl") +include("..\\src\\parser.jl") +include("..\\src\\preparation.jl") urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"]) url = urls[1] queue = Vector{AbstractString}() @testset "check robots.txt" begin + @test HTTP.get(url) != nothing + result, sitemap_queue = check_robots_txt("*", url) @test result == true end -@testset "HTTP get" begin - @test HTTP.get(url) != nothing -end - -@testset "get_urls!" 
begin - get_urls!(url, queue) - @test length(queue) > 1 +@testset "crawl" begin + hostname_url_dict = crawl(urls) + @test length(hostname_url_dict) > 0 end @testset "parse & roll_up" begin From 6dc7aa3f7033959ae6730c250d8fba34a323c2fd Mon Sep 17 00:00:00 2001 From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com> Date: Sun, 11 Aug 2024 02:42:14 -0700 Subject: [PATCH 2/7] structured according to PkgTemplate, other changes --- .JuliaFormatter.toml | 2 + .github/dependabot.yml | 7 + .github/workflows/CI.yml | 18 +-- .github/workflows/CompatHelper.yml | 16 +++ .github/workflows/TagBot.yml | 31 ++++ .gitignore | 5 +- .vscode/settings.json | 6 + MIT => LICENSE | 2 +- Project.toml | 20 ++- docs/Project.toml | 1 - docs/make.jl | 23 ++- docs/src/index.md | 2 +- src/{RAGKit.jl => DocsScraper.jl} | 15 +- src/crawl.jl | 32 ++--- src/extract_urls.jl | 95 ++++++------ src/make_embeddings.jl | 173 ---------------------- src/make_knowledge_packs.jl | 222 +++++++++++++++++++++++++++++ src/parser.jl | 160 +++++++-------------- src/preparation.jl | 58 +++++--- src/user_preferences.jl | 4 + src/utils.jl | 76 ++++++++-- test/runtests.jl | 24 ++-- 22 files changed, 557 insertions(+), 435 deletions(-) create mode 100644 .JuliaFormatter.toml create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/CompatHelper.yml create mode 100644 .github/workflows/TagBot.yml create mode 100644 .vscode/settings.json rename MIT => LICENSE (94%) rename src/{RAGKit.jl => DocsScraper.jl} (56%) delete mode 100644 src/make_embeddings.jl create mode 100644 src/make_knowledge_packs.jl create mode 100644 src/user_preferences.jl diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000..5657bd0 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +# See https://domluna.github.io/JuliaFormatter.jl/stable/ for a list of options +style = "sciml" diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 
0000000..700707c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 371b418..874943f 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -3,7 +3,7 @@ on: push: branches: - main - tags: ['*'] + tags: ["*"] pull_request: workflow_dispatch: concurrency: @@ -23,9 +23,8 @@ jobs: fail-fast: false matrix: version: - <<#VERSIONS>> - - '<<&.>>' - <> + - "1.10" + - "nightly" os: - ubuntu-latest arch: @@ -52,13 +51,11 @@ jobs: actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created contents: write statuses: write - pages: write - id-token: write steps: - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v2 with: - version: '1' + version: "1" - uses: julia-actions/cache@v2 - name: Configure doc environment shell: julia --project=docs --color=yes {0} @@ -75,7 +72,6 @@ jobs: shell: julia --project=docs --color=yes {0} run: | using Documenter: DocMeta, doctest - using <<&PKG>> - DocMeta.setdocmeta!(<<&PKG>>, :DocTestSetup, :(using <<&PKG>>); recursive=true) - doctest(<<&PKG>>) - <> + using DocsScraper + DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) + doctest(DocsScraper) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..d48734a --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,16 @@ +name: CompatHelper +on: + schedule: + - cron: 0 0 1 * * + workflow_dispatch: +jobs: + CompatHelper: + runs-on: ubuntu-latest + steps: + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: 
${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..0cd3114 --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,31 @@ +name: TagBot +on: + issue_comment: + types: + - created + workflow_dispatch: + inputs: + lookback: + default: "3" +permissions: + actions: read + checks: read + contents: write + deployments: read + issues: read + discussions: read + packages: read + pages: read + pull-requests: read + repository-projects: read + security-events: read + statuses: read +jobs: + TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.gitignore b/.gitignore index 9c929a1..8e2d4ba 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ # Ignore .env files .env knowledge_packs/ -Manifest.toml \ No newline at end of file +Manifest.toml +/Manifest.toml +/docs/Manifest.toml +/docs/build/ \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..9238ca7 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "cSpell.words": [ + "eachmatch", + "postprocess" + ] +} diff --git a/MIT b/LICENSE similarity index 94% rename from MIT rename to LICENSE index 775ba1d..d7bd022 100644 --- a/MIT +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) {{{YEAR}}} {{{AUTHORS}}} +Copyright (c) Shreyas Agrawal @splendidbug and J S @svilupp Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Project.toml b/Project.toml index 0c1f6a8..705a918 100644 --- a/Project.toml +++ b/Project.toml @@ -1,17 +1,15 @@ -name = 
"RAGKit" -uuid = "74e640d8-05f4-4b4f-8742-56fc934b3f17" -authors = ["Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>"] +name = "DocsScraper" +uuid = "bd71d052-5e08-40cc-a492-eb4e8da4b649" +authors = ["Shreyas Agrawal @splendidbug and J S @svilupp"] version = "0.1.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" -PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIParser = "30578b45-9adc-5946-b283-645ec420af67" @@ -19,7 +17,19 @@ URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" [compat] AbstractTrees = "0.4.5" +EzXML = "1.2.0" Gumbo = "0.8.2" +HDF5 = "0.17.2" HTTP = "1.10.4" +Inflate = "0.1.5" PromptingTools = "0.36.0" +URIParser = "0.4.1" URIs = "1.5.1" +Tar = "1.10.0" + +[extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Aqua", "Test"] diff --git a/docs/Project.toml b/docs/Project.toml index 6fea155..41b0b18 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -6,7 +6,6 @@ Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" -PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIParser = "30578b45-9adc-5946-b283-645ec420af67" diff --git a/docs/make.jl b/docs/make.jl index 38d4452..a54f0f6 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,24 +1,23 @@ -using Documenter: Documenter, makedocs, deploydocs -using PkgTemplates: PkgTemplates 
+using DocsScraper +using Documenter + +DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) makedocs(; - modules=[PkgTemplates], - authors="Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>", - repo="https://github.com/splendidbug/RAGKit", - sitename="RAGKit.jl", + modules=[DocsScraper], + authors="Shreyas Agrawal @splendidbug and J S @svilupp", + sitename="DocsScraper.jl", # format=Documenter.HTML(; - # repolink="https://github.com/splendidbug/RAGKit", - # canonical="https://juliaci.github.io/PkgTemplates.jl", + # canonical="https://Shreyas Agrawal.github.io/DocsScraper.jl", + # edit_link="master", # assets=String[], # ), pages=[ "Home" => "index.md", - "User Guide" => "user.md", - "Developer Guide" => "developer.md", - "Migrating To PkgTemplates 0.7+" => "migrating.md", ], ) deploydocs(; - repo="https://github.com/splendidbug/RAGKit", + repo="github.com/Shreyas Agrawal/DocsScraper.jl", + devbranch="main", ) diff --git a/docs/src/index.md b/docs/src/index.md index f53a411..a6f0129 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,4 +1,4 @@ -# RAGKit +# DocsScraper ## Documentation diff --git a/src/RAGKit.jl b/src/DocsScraper.jl similarity index 56% rename from src/RAGKit.jl rename to src/DocsScraper.jl index b895363..e78dde7 100644 --- a/src/RAGKit.jl +++ b/src/DocsScraper.jl @@ -1,7 +1,9 @@ -module RAGKit +module DocsScraper using HTTP, Gumbo, AbstractTrees, URIs using Gumbo: HTMLDocument, HTMLElement using EzXML +using Pkg +Pkg.develop(PackageSpec(path="C:\\Users\\shrey\\Desktop\\stuff\\assignments\\grad\\projects\\Julia\\PromptingTools.jl")) using PromptingTools const PT = PromptingTools const RT = PromptingTools.Experimental.RAGTools @@ -12,17 +14,18 @@ using Inflate using SHA using Serialization, URIs -# using Regex - -# using Robots include("parser.jl") include("crawl.jl") include("extract_urls.jl") include("preparation.jl") -include("make_embeddings.jl") -export make_embeddings 
+include("make_knowledge_packs.jl") +export make_knowledge_packs, just_generate + +include("user_preferences.jl") +include("utils.jl") +export remove_urls_from_index end \ No newline at end of file diff --git a/src/crawl.jl b/src/crawl.jl index b147511..a8f93c9 100644 --- a/src/crawl.jl +++ b/src/crawl.jl @@ -2,10 +2,7 @@ """ parse_robots_txt!(robots_txt::String) -Parses the robots.txt string and returns rules along with the URLs on Sitemap - -# Arguments -- `robots_txt`: robots.txt as a string +Parse the robots.txt string and return rules and the URLs on Sitemap """ function parse_robots_txt!(robots_txt::String) rules = Dict{String,Dict{String,Vector{String}}}() @@ -40,17 +37,15 @@ end """ - check_robots_txt(user_agent::AbstractString, - url::AbstractString) + check_robots_txt(user_agent::AbstractString, url::AbstractString) -Checks the robots.txt of a URL and returns a boolean representing if `user_agent` is allowed to crawl the input url +Check robots.txt of a URL and return a boolean representing if `user_agent` is allowed to crawl the input url, along with sitemap urls # Arguments - `user_agent`: user agent attempting to crawl the webpage - `url`: input URL string """ -function check_robots_txt(user_agent::AbstractString, - url::AbstractString) +function check_robots_txt(user_agent::AbstractString, url::AbstractString) ## TODO: Make a cache of rules for a quick lookup # if (haskey(restricted_urls, url)) @@ -101,10 +96,7 @@ end """ get_base_url(url::AbstractString) -Extracts the base url. 
- -# Arguments -- `url`: The url string of which, the base url needs to be extracted +Extract the base url """ function get_base_url(url::AbstractString) @@ -118,10 +110,7 @@ end """ process_hostname(url::AbstractString) -Returns the hostname of an input URL - -# Arguments -- `url`: URL string +Return the hostname of an input URL """ function process_hostname(url::AbstractString) URI = URIs.URI(url) @@ -133,7 +122,7 @@ end """ process_hostname(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}}) -Adds the `url` to it's hostname in `hostname_dict` +Add `url` to its hostname in `hostname_dict` # Arguments - `url`: URL string @@ -154,10 +143,7 @@ end """ crawl(input_urls::Vector{<:AbstractString}) -Crawls on the input URLs and returns a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs - -# Arguments -- `input_urls`: A vector of input URLs +Crawl on the input URLs and return a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs """ function crawl(input_urls::Vector{<:AbstractString}) @@ -187,6 +173,6 @@ function crawl(input_urls::Vector{<:AbstractString}) end end - return hostname_url_dict + return hostname_url_dict, visited_url_set end diff --git a/src/extract_urls.jl b/src/extract_urls.jl index b9ea364..d5e8fcf 100644 --- a/src/extract_urls.jl +++ b/src/extract_urls.jl @@ -1,31 +1,37 @@ -# Temporary until I find a package to simplify this +""" + resolve_url(base_url::String, extracted_url::String) -function resolve_url(base_url::String, relative_url::String)::String - base_uri = URI(base_url) - relative_uri = URI(relative_url) +Check the extracted URL with the original URL. Return empty String if the extracted URL belongs to a different domain. 
+Return complete URL if there's a directory traversal paths or the extracted URL belongs to the same domain as the base_url - ## TODO: Make a list of allowed URLs which would contain Julia docs hostnames +# Arguments +- base_url: URL of the page from which other URLs are being extracted +- extracted_url: URL extracted from the base_url +""" +function resolve_url(base_url::String, extracted_url::String) + base_uri = URI(base_url) + extracted_uri = URI(extracted_url) ## TODO: Look for version number either on the bottom left dropdown or identify on the url - if length(relative_url) > 4 && relative_url[1:4] == "http" - if base_uri.host == relative_uri.host - return relative_url + if length(extracted_url) > 4 && extracted_url[1:4] == "http" + if base_uri.host == extracted_uri.host + return extracted_url end return "" end - if !isempty(relative_url) && relative_url[1] == '#' + if !isempty(extracted_url) && extracted_url[1] == '#' return "" end - if !isempty(relative_uri.path) && relative_uri.path[1] == '/' + if !isempty(extracted_uri.path) && extracted_uri.path[1] == '/' resolved_uri = URI( - scheme=base_uri.scheme, - userinfo=base_uri.userinfo, - host=base_uri.host, - port=base_uri.port, - path=relative_uri.path, - query=relative_uri.query, - fragment=relative_uri.fragment + scheme = base_uri.scheme, + userinfo = base_uri.userinfo, + host = base_uri.host, + port = base_uri.port, + path = extracted_uri.path, + query = extracted_uri.query, + fragment = extracted_uri.fragment ) return string(resolved_uri) end @@ -34,11 +40,11 @@ function resolve_url(base_url::String, relative_url::String)::String base_segments = split(base_uri.path, "/") base_segments = filter((i) -> i != "", base_segments) - relative_segments = split(relative_uri.path, "/") - relative_segments = filter((i) -> i != "", relative_segments) + extracted_segments = split(extracted_uri.path, "/") + extracted_segments = filter((i) -> i != "", extracted_segments) - # Process the relative segments - for segment in 
relative_segments + # Process the directory traversal paths + for segment in extracted_segments if segment == ".." if !isempty(base_segments) pop!(base_segments) @@ -53,31 +59,29 @@ function resolve_url(base_url::String, relative_url::String)::String # Create the resolved URI resolved_uri = URI( - scheme=base_uri.scheme, - userinfo=base_uri.userinfo, - host=base_uri.host, - port=base_uri.port, - path=resolved_path, - query=relative_uri.query, - fragment=relative_uri.fragment + scheme = base_uri.scheme, + userinfo = base_uri.userinfo, + host = base_uri.host, + port = base_uri.port, + path = resolved_path, + query = extracted_uri.query, + fragment = extracted_uri.fragment ) return string(resolved_uri) end - """ - find_urls!(url::AbstractString, - node::Gumbo.HTMLElement, - url_queue::Vector{<:AbstractString} + find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString} -Function to recursively find and extract the urls +Function to recursively find tags and extract the urls # Arguments - url: The initial input URL - node: The HTML node of type Gumbo.HTMLElement - url_queue: Vector in which extracted URLs will be appended """ -function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}) +function find_urls_html!( + url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}) if Gumbo.tag(node) == :a && haskey(node.attributes, "href") href = node.attributes["href"] if href !== nothing && !isempty(resolve_url(url, href)) @@ -85,6 +89,7 @@ function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue end end + # Go deep in the HTML tags and check if `node` is an tag for child in node.children if isa(child, HTMLElement) find_urls_html!(url, child, url_queue) @@ -92,9 +97,18 @@ function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue end end +""" + find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString}) 
+Identify URL through regex pattern in xml files and push in `url_queue` +# Arguments +- url: url from which all other URLs will be extracted +- url_queue: Vector in which extracted URLs will be appended +""" function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString}) + # If a string starts with "http" then it is considered as a URL regardless of it being valid. + # Validity of URLs are checked during HTTP fetch try fetched_content = HTTP.get(url) xml_content = String(fetched_content.body) @@ -108,32 +122,23 @@ function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString} end end - - """ get_links!(url::AbstractString, url_queue::Vector{<:AbstractString}) -Function to extract urls inside tags +Extract urls inside html or xml files # Arguments - url: url from which all other URLs will be extracted - url_queue: Vector in which extracted URLs will be appended """ function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString}) - @info "Scraping link: $url" - # println(url) - # try fetched_content = HTTP.get(url) parsed = Gumbo.parsehtml(String(fetched_content.body)) - if (url[end-3:end] == ".xml") + if (url[(end - 3):end] == ".xml") find_urls_xml!(url_xml, url_queue) else find_urls_html!(url, parsed.root, url_queue) end - # print("-------------") - # catch e - # println("Bad URL: $url") - # end end \ No newline at end of file diff --git a/src/make_embeddings.jl b/src/make_embeddings.jl deleted file mode 100644 index f51c865..0000000 --- a/src/make_embeddings.jl +++ /dev/null @@ -1,173 +0,0 @@ -## TODO: Make a function to Check for version number - -""" - report_artifact() - -prints artifact information -""" -function report_artifact(fn_output) - @info("ARTIFACT: $(basename(fn_output))") - @info("sha256: ", bytes2hex(open(sha256, fn_output))) - @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output)))) -end - - - - -""" - create_output_folders() - -Creates output folders -""" -function 
create_output_folders(knowledge_pack_path::String) - # Define the folder path - folder_path = joinpath(knowledge_pack_path, "packs") - println("folder_path:", folder_path) - # Check if the folder exists - if !isdir(folder_path) - mkpath(folder_path) - @info "Folder created: $folder_path" - else - @info "Folder already exists: $folder_path" - end - -end - -""" - make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}) - -Parses URLs from hostname_url_dict and saves the chunks - -# Arguments -- hostname_url_dict: Dict with key being hostname and value being a vector of URLs -""" -function make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String) - output_chunks = Vector{SubString{String}}() - output_sources = Vector{String}() - SAVE_CHUNKS = true - CHUNK_SIZE = 512 - for (hostname, urls) in hostname_url_dict - for url in urls - try - chunks, sources = process_paths(url) - append!(output_chunks, chunks) - append!(output_sources, sources) - catch - @error "error!! 
check url: $url" - end - end - if SAVE_CHUNKS - serialize(joinpath(knowledge_pack_path, "$(hostname)-chunks-$(CHUNK_SIZE).jls"), output_chunks) - serialize(joinpath(knowledge_pack_path, "$(hostname)-sources-$(CHUNK_SIZE).jls"), output_sources) - end - - end - - -end - -function l2_norm_columns(mat::AbstractMatrix) - norm_ = norm.(eachcol(mat)) - return mat ./ norm_' -end -function l2_norm_columns(vect::AbstractVector) - norm_ = norm(vect) - return vect / norm_ -end - - -""" - generate_embeddings() - -Deserializes chunks and sources to generate embeddings -""" -function generate_embeddings(knowledge_pack_path::String) - embedder = RT.BatchEmbedder() - entries = readdir(knowledge_pack_path) - - # Initialize a dictionary to group files by hostname and chunk size - hostname_files = Dict{String,Dict{Int,Dict{String,String}}}() - - # Regular expressions to match the file patterns - chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$" - sources_pattern = r"^(.*)-sources-(\d+)\.jls$" - - # Group files by hostname and chunk size - for file in entries - match_chunks = match(chunks_pattern, file) - match_sources = match(sources_pattern, file) - - if match_chunks !== nothing - hostname = match_chunks.captures[1] - chunk_size = parse(Int, match_chunks.captures[2]) - if !haskey(hostname_files, hostname) - hostname_files[hostname] = Dict{Int,Dict{String,String}}() - end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String,String}() - end - hostname_files[hostname][chunk_size]["chunks"] = joinpath(knowledge_pack_path, file) - elseif match_sources !== nothing - hostname = match_sources.captures[1] - chunk_size = parse(Int, match_sources.captures[2]) - if !haskey(hostname_files, hostname) - hostname_files[hostname] = Dict{Int,Dict{String,String}}() - end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String,String}() - end - hostname_files[hostname][chunk_size]["sources"] = 
joinpath(knowledge_pack_path, file) - end - end - - - # Process each pair of files - for (hostname, chunk_files) in hostname_files - for (chunk_size, files) in chunk_files - if haskey(files, "chunks") && haskey(files, "sources") - chunks_file = files["chunks"] - sources_file = files["sources"] - chunks = deserialize(chunks_file) - sources = deserialize(sources_file) - cost_tracker = Threads.Atomic{Float64}(0.0) - full_embeddings = RT.get_embeddings(embedder, chunks; model="text-embedding-3-large", verbose=false, cost_tracker, dimensions=1024) - - fn_output = joinpath(knowledge_pack_path, "packs", "$hostname-textembedding3large-0-Float32__v1.0.tar.gz") - fn_temp = joinpath(knowledge_pack_path, "packs", "pack.hdf5") - h5open(fn_temp, "w") do file - file["chunks"] = chunks - file["sources"] = sources - file["embeddings"] = full_embeddings[1:1024, :] |> l2_norm_columns |> x -> map(>(0), x) - file["type"] = "ChunkIndex" - # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl" - end - command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` - run(command) - report_artifact(fn_output) - - else - @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size" - end - end - end - -end - - - -""" - make_embeddings(input_urls::Vector{<:AbstractString}) - -Entry point to crawl, parse and create embeddings - -# Arguments -- input_urls: vector containing URL strings to parse -""" -function make_embeddings(input_urls::Vector{<:AbstractString}) - hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}() - hostname_url_dict = crawl(input_urls) - knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs") - create_output_folders(knowledge_pack_path) - make_chunks(hostname_url_dict, knowledge_pack_path) - generate_embeddings(knowledge_pack_path) -end \ No newline at end of file diff --git 
a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl new file mode 100644 index 0000000..291a9c7 --- /dev/null +++ b/src/make_knowledge_packs.jl @@ -0,0 +1,222 @@ +""" + report_artifact(fn_output) + +Print artifact information +""" +function report_artifact(fn_output) + @info("ARTIFACT: $(basename(fn_output))") + @info("sha256: ", bytes2hex(open(sha256, fn_output))) + @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output)))) +end + +""" + create_output_folders(knowledge_pack_path::String) + +Create output folders on the knowledge_pack_path +""" +function create_output_folders(knowledge_pack_path::String) + # Define the folder path + folder_path = joinpath(knowledge_pack_path, "packs") + # Check if the folder exists + if !isdir(folder_path) + mkpath(folder_path) + end +end + +""" + make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; max_chunk_size::Int=MAX_CHUNK_SIZE, + min_chunk_size::Int=MIN_CHUNK_SIZE) + +Parse URLs from hostname_url_dict and save the chunks + +# Arguments +- hostname_url_dict: Dict with key being hostname and value being a vector of URLs +- knowledge_pack_path: Knowledge pack path +- max_chunk_size: Maximum chunk size +- min_chunk_size: Minimum chunk size +""" +function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}}, + knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE, + min_chunk_size::Int = MIN_CHUNK_SIZE) + SAVE_CHUNKS = true + for (hostname, urls) in hostname_url_dict + output_chunks = Vector{SubString{String}}() + output_sources = Vector{String}() + for url in urls + try + chunks, sources = process_paths(url; max_chunk_size, min_chunk_size) + append!(output_chunks, chunks) + append!(output_sources, sources) + catch + @error "error!! 
check url: $url" + end + end + if SAVE_CHUNKS + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_chunks) + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_sources) + end + end +end + +""" + l2_norm_columns(mat::AbstractMatrix) + +Normalize the columns of the input embeddings +""" +function l2_norm_columns(mat::AbstractMatrix) + norm_ = norm.(eachcol(mat)) + return mat ./ norm_' +end + +""" + l2_norm_columns(vect::AbstractVector) + +Normalize the columns of the input embeddings +""" +function l2_norm_columns(vect::AbstractVector) + norm_ = norm(vect) + return vect / norm_ +end + +""" + generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE) + +Deserialize chunks and sources to generate embeddings + +# Arguments +- model: Embedding model +- embedding_size: Embedding dimensions +""" +function generate_embeddings(knowledge_pack_path::String; model::AbstractString = MODEL, + embedding_size::Int = EMBEDDING_SIZE) + embedder = RT.BatchEmbedder() + entries = readdir(knowledge_pack_path) + # Initialize a dictionary to group files by hostname and chunk size + hostname_files = Dict{String, Dict{Int, Dict{String, String}}}() + + # Regular expressions to match the file patterns of chunks and sources + chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$" + sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$" + + # chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$" + # sources_pattern = r"^(.*)-sources-(\d+)\.jls$" + + # Group files by hostname and chunk size + for file in entries + match_chunks = match(chunks_pattern, file) + match_sources = match(sources_pattern, file) + + if match_chunks !== nothing + hostname = match_chunks.captures[1] + chunk_size = parse(Int, match_chunks.captures[2]) + if !haskey(hostname_files, hostname) + hostname_files[hostname] = 
Dict{Int, Dict{String, String}}() + end + if !haskey(hostname_files[hostname], chunk_size) + hostname_files[hostname][chunk_size] = Dict{String, String}() + end + hostname_files[hostname][chunk_size]["chunks"] = joinpath( + knowledge_pack_path, file) + elseif match_sources !== nothing + hostname = match_sources.captures[1] + chunk_size = parse(Int, match_sources.captures[2]) + if !haskey(hostname_files, hostname) + hostname_files[hostname] = Dict{Int, Dict{String, String}}() + end + if !haskey(hostname_files[hostname], chunk_size) + hostname_files[hostname][chunk_size] = Dict{String, String}() + end + hostname_files[hostname][chunk_size]["sources"] = joinpath( + knowledge_pack_path, file) + end + end + # Process each pair of files + for (hostname, chunk_files) in hostname_files + for (chunk_size, files) in chunk_files + if haskey(files, "chunks") && haskey(files, "sources") + chunks_file = files["chunks"] + sources_file = files["sources"] + chunks = deserialize(chunks_file) + sources = deserialize(sources_file) + cost_tracker = Threads.Atomic{Float64}(0.0) + full_embeddings = RT.get_embeddings( + embedder, chunks; model, verbose = false, cost_tracker) + @info "Created embeddings for $hostname. 
Cost: \$$(round(cost_tracker[], digits=3))" + fn_output = joinpath(knowledge_pack_path, "packs", + "$hostname-textembedding3large-0-Float32__v1.0.tar.gz") + fn_temp = joinpath(knowledge_pack_path, "packs", + "$hostname-textembedding3large-0-Float32__v1.0.hdf5") + h5open(fn_temp, "w") do file + file["chunks"] = chunks + file["sources"] = sources + file["embeddings"] = full_embeddings[1:embedding_size, :] |> + l2_norm_columns |> x -> map(>(0), x) + file["type"] = "ChunkIndex" + # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl" + end + + command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` + run(command) + report_artifact(fn_output) + + else + @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size" + end + end + end +end + +""" + make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[], + max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE) + +Entry point to crawl, parse and generate embeddings + +# Arguments +- crawlable_urls: URLs that should be crawled to find more links +- single_urls: Single page URLs that should just be scraped and parsed. 
The crawler won't look for more URLs +- max_chunk_size: Maximum chunk size +- min_chunk_size: Minimum chunk size +- model: Embedding model +- embedding_size: Embedding dimensions +""" +function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[]; + single_urls::Vector{<:AbstractString} = String[], + max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE, + model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE) + if isempty(crawlable_urls) && isempty(single_urls) + error("At least one of `input_urls` or `single_pages` must be provided.") + end + + hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}() + + if !isempty(crawlable_urls) + hostname_url_dict, visited_url_set = crawl(crawlable_urls) + else + visited_url_set = Set{AbstractString}() + end + for url in single_urls + base_url = get_base_url(url) + if !in(base_url, visited_url_set) + push!(visited_url_set, base_url) + crawlable, sitemap_urls = check_robots_txt("*", base_url) + if crawlable + try + process_hostname!(url, hostname_url_dict) + catch + @error "Bad URL: $base_url" + end + end + end + end + knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs") + create_output_folders(knowledge_pack_path) + make_chunks(hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size) + generate_embeddings(knowledge_pack_path; model, embedding_size) +end diff --git a/src/parser.jl b/src/parser.jl index d909280..def1a17 100644 --- a/src/parser.jl +++ b/src/parser.jl @@ -1,21 +1,3 @@ -""" -Working: - -Since HTML structure is complex, we need to figure out when do we insert the extracted text in parsed_blocks -ie., should we add the text of child hierarchy and then insert or should we insert now and let the child hierarchy make another insertion. -For this we employ multiple checks. If the current node is heading, directly insert into parsed_blocks. -If the current node is a code block, return the text inside code block with backticks. 
-If the node is neither heading nor code, then we'll need to go deeper in the hierarchy. -if the current node's tag is from the list [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td] -it is assumed that everything inside the tag is part of a single text block with inline code. -But when we go deeper and if there is a code block with size > 50 chars, then our assumption was false. -To correct this, we first insert the previously extracted text, next we insert the current code and additionally indicate the parent recursion iteration -that the current iteration has inserted the previously parsed text, so there is no need for parent iteration to insert the text block again. -We indicate this by a return flag is_text_inserted -""" - - - """ insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, parsed_blocks::Vector{Dict{String,Any}}, @@ -30,11 +12,10 @@ Insert the text into parsed_blocks Vector - text_to_insert: Text to be inserted - text_type: The text to be inserted could be heading or a code block or just text """ -function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - text_to_insert::AbstractString, - text_type::AbstractString) - +function insert_parsed_data!(heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + text_to_insert::AbstractString, + text_type::AbstractString) if !isempty(strip(text_to_insert)) push!(parsed_blocks, Dict(text_type => strip(text_to_insert), @@ -42,8 +23,6 @@ function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, end end - - """ process_headings!(node::Gumbo.HTMLElement, heading_hierarchy::Dict{Symbol,Any}, @@ -57,13 +36,13 @@ Process headings. 
If the current node is heading, directly insert into parsed_bl - parsed_blocks: Vector of Dicts to store parsed text and metadata """ function process_headings!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}) tag_name = Gumbo.tag(node) # Clear headings of equal or lower level for k in collect(keys(heading_hierarchy)) - if k != "header" && Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name))) + if k != "header" && + Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name))) delete!(heading_hierarchy, k) end end @@ -123,11 +102,10 @@ If the node is neither heading nor code - prev_text_buffer: IO Buffer which contains previous text """ function process_generic_node!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) seekstart(prev_text_buffer) prev_text = read(prev_text_buffer, String) @@ -142,10 +120,15 @@ function process_generic_node!(node::Gumbo.HTMLElement, # if the current tag belongs in the list, it is assumed that all the text/code should be part of a single paragraph/block, unless, # there occurs a code block with >50 chars, then, previously parsed text is inserted first, then the code block is inserted. 
- if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td, :a, :span, :header] - received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, + :cite, :address, :em, :td, :a, :span, :header] + received_text, is_code_block, is_text_inserted = process_node!( + child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + elseif tag_name in [:script] + continue else - received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + received_text, is_code_block, is_text_inserted = process_node!( + child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end # changing text_to_insert to "" to avoid inserting text_to_insert again (as it was inserted by the child recursion call) @@ -180,7 +163,6 @@ function process_generic_node!(node::Gumbo.HTMLElement, print(prev_text_buffer, " " * received_text) text_to_insert = text_to_insert * " " * received_text end - end # if child_new is false, this means new child (new entry in parsed_blocks) should not be created, hence, @@ -195,7 +177,8 @@ function process_generic_node!(node::Gumbo.HTMLElement, # if we're insert text in current node level, then we should insert the previous text if available, # otherwise it'll be inserted when the control goes back to the parent call and hence, order of the insertion will be weird if !isempty(strip(text_to_insert)) - insert_parsed_data!(heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") + insert_parsed_data!( + heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") is_text_inserted = true end @@ -205,7 +188,6 @@ function process_generic_node!(node::Gumbo.HTMLElement, return "", is_code_block, is_text_inserted end - """ process_docstring!(node::Gumbo.HTMLElement, heading_hierarchy::Dict{Symbol,Any}, @@ -224,11 
+206,10 @@ Function to process node of class `docstring` - prev_text_buffer: IO Buffer which contains previous text """ function process_docstring!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) seekstart(prev_text_buffer) prev_text = read(prev_text_buffer, String) is_code_block = false @@ -248,10 +229,12 @@ function process_docstring!(node::Gumbo.HTMLElement, # Insert "header" if Gumbo.tag(children[1]) == :header heading_hierarchy[:docstring_header] = strip(Gumbo.text(children[1])) - insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") + insert_parsed_data!( + heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") end - received_text, is_code_block, is_text_inserted = process_node!(children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + received_text, is_code_block, is_text_inserted = process_node!( + children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) if !isempty(strip(received_text)) insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "text") @@ -279,11 +262,10 @@ Function to process a node - prev_text_buffer: IO Buffer which contains previous text """ function process_node!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) tag_name = Gumbo.tag(node) if startswith(string(tag_name), "h") && isdigit(last(string(tag_name))) return process_headings!(node, 
heading_hierarchy, parsed_blocks) @@ -292,15 +274,14 @@ function process_node!(node::Gumbo.HTMLElement, return process_code(node) elseif tag_name == :article && getattr(node, "class", "") == "docstring" - return process_docstring!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) - + return process_docstring!( + node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end - return process_generic_node!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) - + return process_generic_node!( + node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end - """ multiple dispatch for process_node!() when node is of type Gumbo.HTMLText """ @@ -310,14 +291,10 @@ function process_node!(node::Gumbo.HTMLText, args...) return strip(Gumbo.text(node)), is_code_block, is_text_inserted end - """ get_base_url(url::AbstractString) -Extracts the base url. - -# Arguments -- `url`: The url string of which, the base url needs to be extracted +Extract the base url. """ function get_base_url(url::AbstractString) parsed_url = URIs.URI(url) @@ -329,7 +306,7 @@ end """ get_html_content(root::Gumbo.HTMLElement) -Returns the main content of the HTML. If not found, returns the whole HTML to parse +Return the main content of the HTML. 
If not found, return the whole HTML to parse # Arguments - `root`: The HTML root from which content is extracted @@ -338,73 +315,34 @@ function get_html_content(root::Gumbo.HTMLElement) target_ids = Set(["VPContent", "main_content_wrap", "pages-content"]) target_classes = Set(["content", "franklin-content"]) - content_candidates = [el for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement] + content_candidates = [el + for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement] # First try to find by ID - content_by_id = filter(el -> getattr(el, "id", nothing) in target_ids, content_candidates) + content_by_id = filter( + el -> getattr(el, "id", nothing) in target_ids, content_candidates) if !isempty(content_by_id) return only(content_by_id) end # Fallback to class if no ID matches - content_by_class = filter(el -> getattr(el, "class", nothing) in target_classes, content_candidates) + content_by_class = filter( + el -> getattr(el, "class", nothing) in target_classes, content_candidates) if !isempty(content_by_class) return only(content_by_class) end # Fallback to the root node if no class matches return root - end - """ parse_url(url::AbstractString) -Initiator and main function to parse HTML from url +Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata # Arguments - `url`: URL string to parse - -# Returns -- A Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata - -# Usage -parsed_blocks = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/") - -# Example -Let the HTML be: - - - - -

<h1>Heading 1</h1>
-<h2>Heading 2</h2>
-<p>para 1</p>
-<h3>Heading 3</h3>
-<code>this is my code block</code>
-<h3>This is another h3 under Heading 2</h3>
-<p>This is a paragraph with <code>inline code</code></p>
-
-<h2>Heading 2_2</h2>
-<p>para ewg</p>

- - - - -Output: -Any[ - Dict{String, Any}("URL" => "URL") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3") - Dict{String, Any}("code" => "```julia this is my code block```", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2")) - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg") -] """ function parse_url_to_blocks(url::AbstractString) @@ -419,8 +357,8 @@ function parse_url_to_blocks(url::AbstractString) # title = [el # for el in AbstractTrees.PreOrderDFS(r_parsed.root) # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ") - parsed_blocks = Vector{Dict{String,Any}}([Dict("Source" => base_url)]) - heading_hierarchy = Dict{Symbol,Any}() + parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)]) + heading_hierarchy = Dict{Symbol, Any}() process_node!(get_html_content(parsed.root), heading_hierarchy, parsed_blocks) return parsed_blocks catch diff --git a/src/preparation.jl b/src/preparation.jl index ab8d7b5..9979155 100644 --- 
a/src/preparation.jl +++ b/src/preparation.jl @@ -1,9 +1,7 @@ -# include("recursive_splitter.jl") -include("utils.jl") """ get_header_path(d::Dict) -Concatenates the h1, h2, h3 keys from the metadata of a Dict +Concatenate the h1, h2, h3 keys from the metadata of a Dict # Examples ```julia @@ -12,7 +10,7 @@ get_header_path(d) # Output: "Axis/Attributes/yzoomkey" ``` """ -function get_header_path(d::Dict) +function get_header_path(d::Dict{String,Any}) metadata = get(d, "metadata", Dict{Any,Any}()) isempty(metadata) && return nothing keys_ = [:h1, :h2, :h3] @@ -21,8 +19,13 @@ function get_header_path(d::Dict) end -"Roll-up chunks (that have the same header!), so we can split them later by to get the desired length" -function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="") + +""" + roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="") + +Roll-up chunks (that have the same header!), so we can split them later by to get the desired length +""" +function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="") docs = String[] io = IOBuffer() last_header = nothing @@ -35,7 +38,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="< str = String(take!(io)) if !isempty(str) push!(docs, str) - src = url * (isnothing(last_header) ? "" : "::$last_header") + src = url * (isnothing(last_header) ? "" : " - $last_header") push!(sources, src) end last_header = header @@ -48,7 +51,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="< str = String(take!(io)) if !isempty(str) push!(docs, str) - src = url * (isnothing(last_header) ? "" : "::$last_header") + src = url * (isnothing(last_header) ? 
"" : " - $last_header") push!(sources, src) end return docs, sources @@ -56,19 +59,23 @@ end struct DocParserChunker <: RT.AbstractChunker end -""" - RT.get_chunks(chunker::DocParserChunker, - html_files::Vector{<:AbstractString}; - sources::AbstractVector{<:AbstractString}=html_files, - verbose::Bool=true, - separators=["\n\n", ". ", "\n", " "], max_length::Int=256) -Extracts chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, and splits them by separators to get the desired length. +""" + RT.get_chunks(chunker::DocParserChunker, url::AbstractString; + verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE) + +Extract chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, +and splits them by separators to get the desired length. + +# Arguments +- chunker: DocParserChunker +- url: URL of the webpage to extract chunks +- verbose: Bool to print the log +- separators: Chunk separators +- max_chunk_size Maximum chunk size """ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; - verbose::Bool=true, - separators=["\n\n", ". ", "\n", " "], max_length::Int=256) - + verbose::Bool=true, separators=["\n\n", ". 
", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE) SEP = "" sources = AbstractVector{<:AbstractString} @@ -84,8 +91,9 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; ## roll up chunks by SEP splitter, then remove it later for (doc, src) in zip(docs_, sources_) ## roll up chunks by SEP splitter, then remove it later - doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length) .|> + doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length=max_chunk_size) .|> x -> replace(x, SEP => " ") .|> strip |> x -> filter(!isempty, x) + chunk_lengths = length.(doc_chunks) # skip if no chunks found isempty(doc_chunks) && continue append!(output_chunks, doc_chunks) @@ -96,20 +104,24 @@ end -"Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them." -function process_paths(url::AbstractString, max_length::Int=512) +""" + process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) + +Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them. 
+""" +function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) output_chunks = Vector{SubString{String}}() output_sources = Vector{String}() - chunks, sources = RT.get_chunks(DocParserChunker(), url; max_length) + chunks, sources = RT.get_chunks(DocParserChunker(), url; max_chunk_size) append!(output_chunks, chunks) append!(output_sources, sources) @info "Scraping done: $(length(output_chunks)) chunks" - postprocess_chunks(output_chunks, output_sources; min_length=40, skip_code=true) + output_chunks, output_sources = postprocess_chunks(output_chunks, output_sources; min_chunk_size, skip_code=true) return output_chunks, output_sources end diff --git a/src/user_preferences.jl b/src/user_preferences.jl new file mode 100644 index 0000000..98794c6 --- /dev/null +++ b/src/user_preferences.jl @@ -0,0 +1,4 @@ +global MIN_CHUNK_SIZE = 40 +global MAX_CHUNK_SIZE = 256 +global MODEL = "text-embedding-3-large" +global EMBEDDING_SIZE = 1024 \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index 4bf1e07..e8dc014 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,9 @@ -"Finds duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, where `true` indicates a duplicate (second instance of the same text)." +""" + find_duplicates(chunks::AbstractVector{<:AbstractString}) + +Find duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, +where `true` indicates a duplicate (second instance of the same text). +""" function find_duplicates(chunks::AbstractVector{<:AbstractString}) # hash the chunks for easier search hashed_chunks = bytes2hex.(sha256.(chunks)) @@ -20,20 +25,34 @@ function find_duplicates(chunks::AbstractVector{<:AbstractString}) return duplicates end -"Removes chunks that are duplicated in the input list of chunks and their corresponding sources." 
+""" + remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}) + +Remove chunks that are duplicated in the input list of chunks and their corresponding sources. +""" function remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}) idxs = find_duplicates(chunks) return chunks[.!idxs], sources[.!idxs] end -"Removes chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources." -function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true) + +""" + remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true) + +Remove chunks that are shorter than a specified length (`min_chunk_size`) from the input list of chunks and their corresponding sources. 
+""" +function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true) + + chunk_lengths = length.(chunks) idx = if skip_code - ## Keep short chunks if they contain code (might be combined with some preceding/suceeeding text) - findall(x -> length(x) >= min_length || occursin("```", x), chunks) + ## Keep short chunks if they contain code (might be combined with some preceding/succeeding text) + findall(x -> length(x) >= min_chunk_size || occursin("```", x), chunks) else - findall(x -> length(x) >= min_length, chunks) + findall(x -> length(x) >= min_chunk_size, chunks) end + chunk_lengths = length.(chunks[idx]) return chunks[idx], sources[idx] end @@ -42,14 +61,24 @@ function replace_local_paths(sources::AbstractVector{<:AbstractString}, paths::A @assert length(paths) == length(websites) "Length of `paths` must match length of `websites`" replacement_pairs = paths .=> websites output = map(x -> replace(x, replacement_pairs...), sources) + return output end -"Post-processes the input list of chunks and their corresponding sources by removing short chunks and duplicates." -function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true, - paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing) + + +""" + function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, + websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing) + +Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates. 
+""" +function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, + websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing) len_ = length(chunks) - chunks, sources = remove_short_chunks(chunks, sources; min_length, skip_code) + chunks, sources = remove_short_chunks(chunks, sources; min_chunk_size, skip_code) @info "Removed $(len_ - length(chunks)) short chunks" len_ = length(chunks) @@ -63,6 +92,31 @@ function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::A end return chunks, sources +end + +""" + function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString}) + +Remove chunks and sources corresponding to URLs starting with `prefix_urls` +""" +function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString}) + @assert endswith(file_path, ".hdf5") "Provided file path must end with `.hdf5` (see HDF5.jl)." 
+ h5open(index_path, "r+") do orig_file + # Load the sources dataset into a Julia array + sources = read(orig_file["sources"]) + chunks = read(orig_file["chunks"]) + embeddings = read(orig_file["embeddings"]) + for url_to_remove in prefix_urls + indices_to_remove = findall(x -> startswith(x, url_to_remove), sources) + sources = deleteat!(sources, indices_to_remove) + chunks = deleteat!(chunks, indices_to_remove) + embeddings = embeddings[:, setdiff(1:size(embeddings, 2), indices_to_remove)] + end + + write(file["sources"], sources) + write(file["chunks"], chunks) + write(file["embeddings"], embeddings) + end end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 78a78b4..4b4a92c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,21 +10,22 @@ using LinearAlgebra, Unicode, SparseArrays using HDF5 using Tar using Inflate - using SHA using Serialization, URIs -include("..\\src\\crawl.jl") -include("..\\src\\extract_urls.jl") -include("..\\src\\parser.jl") -include("..\\src\\preparation.jl") +include(joinpath("..", "src", "crawl.jl")) +include(joinpath("..", "src", "extract_urls.jl")) +include(joinpath("..", "src", "parser.jl")) +include(joinpath("..", "src", "preparation.jl")) +include(joinpath("..", "src", "user_preferences.jl")) +include(joinpath("..", "src", "utils.jl")) + urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"]) url = urls[1] queue = Vector{AbstractString}() -@testset "check robots.txt" begin +@testset "HTTP" begin @test HTTP.get(url) != nothing - result, sitemap_queue = check_robots_txt("*", url) @test result == true end @@ -38,12 +39,13 @@ end parsed_blocks = parse_url_to_blocks(url) @test length(parsed_blocks) > 0 SEP = "" - docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP) - @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && sources_[1] != nothing + docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP) + @test length(docs_) > 0 && 
length(sources_) > 0 && docs_[1] != nothing && + sources_[1] != nothing end @testset "overall test" begin chunks, sources = process_paths(url) - @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && sources[1] != nothing - + @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && + sources[1] != nothing end From 0782e01ba709f18bda1cca77cb83e8b73922630e Mon Sep 17 00:00:00 2001 From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com> Date: Sun, 11 Aug 2024 03:19:20 -0700 Subject: [PATCH 3/7] dependency changes --- .github/workflows/CI.yml | 1 - Project.toml | 4 ++++ src/DocsScraper.jl | 3 --- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 874943f..0b6af25 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -24,7 +24,6 @@ jobs: matrix: version: - "1.10" - - "nightly" os: - ubuntu-latest arch: diff --git a/Project.toml b/Project.toml index 705a918..bc05f3f 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,8 @@ PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIParser = "30578b45-9adc-5946-b283-645ec420af67" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [compat] AbstractTrees = "0.4.5" @@ -26,6 +28,8 @@ PromptingTools = "0.36.0" URIParser = "0.4.1" URIs = "1.5.1" Tar = "1.10.0" +LinearAlgebra = "<0.0.1, 1" +SparseArrays = "<0.0.1, 1" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" diff --git a/src/DocsScraper.jl b/src/DocsScraper.jl index e78dde7..40bc3ee 100644 --- a/src/DocsScraper.jl +++ b/src/DocsScraper.jl @@ -2,8 +2,6 @@ module DocsScraper using HTTP, Gumbo, AbstractTrees, URIs using Gumbo: HTMLDocument, HTMLElement using EzXML -using Pkg 
-Pkg.develop(PackageSpec(path="C:\\Users\\shrey\\Desktop\\stuff\\assignments\\grad\\projects\\Julia\\PromptingTools.jl")) using PromptingTools const PT = PromptingTools const RT = PromptingTools.Experimental.RAGTools @@ -27,5 +25,4 @@ include("user_preferences.jl") include("utils.jl") export remove_urls_from_index - end \ No newline at end of file From 52998a90e7f8373d879537e8242309a93c227a39 Mon Sep 17 00:00:00 2001 From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com> Date: Sun, 11 Aug 2024 03:30:37 -0700 Subject: [PATCH 4/7] dependency changes --- .github/workflows/CI.yml | 62 ++++++++++++++++++++-------------------- Project.toml | 15 ++++++---- 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 0b6af25..1c00a7a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -43,34 +43,34 @@ jobs: files: lcov.info token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: false - docs: - name: Documentation - runs-on: ubuntu-latest - permissions: - actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created - contents: write - statuses: write - steps: - - uses: actions/checkout@v4 - - uses: julia-actions/setup-julia@v2 - with: - version: "1" - - uses: julia-actions/cache@v2 - - name: Configure doc environment - shell: julia --project=docs --color=yes {0} - run: | - using Pkg - Pkg.develop(PackageSpec(path=pwd())) - Pkg.instantiate() - - uses: julia-actions/julia-buildpkg@v1 - - uses: julia-actions/julia-docdeploy@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} - - name: Run doctests - shell: julia --project=docs --color=yes {0} - run: | - using Documenter: DocMeta, doctest - using DocsScraper - DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) - doctest(DocsScraper) + # docs: + # name: Documentation + # runs-on: ubuntu-latest + # 
permissions: + # actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created + # contents: write + # statuses: write + # steps: + # - uses: actions/checkout@v4 + # - uses: julia-actions/setup-julia@v2 + # with: + # version: "1" + # - uses: julia-actions/cache@v2 + # - name: Configure doc environment + # shell: julia --project=docs --color=yes {0} + # run: | + # using Pkg + # Pkg.develop(PackageSpec(path=pwd())) + # Pkg.instantiate() + # - uses: julia-actions/julia-buildpkg@v1 + # - uses: julia-actions/julia-docdeploy@v1 + # env: + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} + # - name: Run doctests + # shell: julia --project=docs --color=yes {0} + # run: | + # using Documenter: DocMeta, doctest + # using DocsScraper + # DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) + # doctest(DocsScraper) diff --git a/Project.toml b/Project.toml index bc05f3f..16502d1 100644 --- a/Project.toml +++ b/Project.toml @@ -10,12 +10,15 @@ Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" +SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIParser = "30578b45-9adc-5946-b283-645ec420af67" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] AbstractTrees = "0.4.5" @@ -24,12 +27,14 @@ Gumbo = "0.8.2" HDF5 = "0.17.2" HTTP = "1.10.4" Inflate = "0.1.5" +LinearAlgebra = "<0.0.1, 1" PromptingTools = "0.36.0" 
+SparseArrays = "<0.0.1, 1" +Tar = "1.10.0" URIParser = "0.4.1" URIs = "1.5.1" -Tar = "1.10.0" -LinearAlgebra = "<0.0.1, 1" -SparseArrays = "<0.0.1, 1" +SHA = "0.7.0" + [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" From 6f32002254aaa8ff829225fb9d1a92cd1980398c Mon Sep 17 00:00:00 2001 From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com> Date: Wed, 14 Aug 2024 15:00:36 -0700 Subject: [PATCH 5/7] code improvements --- .JuliaFormatter.toml | 1 + .github/workflows/CI.yml | 70 +++++++-------- .gitignore | 4 +- .vscode/settings.json | 6 -- LICENSE | 2 +- Project.toml | 38 ++++---- docs/Project.toml | 10 +-- docs/make.jl | 29 ++++--- docs/src/index.md | 8 +- src/DocsScraper.jl | 11 ++- src/crawl.jl | 18 ++-- src/extract_package_name.jl | 162 +++++++++++++++++++++++++++++++++++ src/extract_urls.jl | 2 +- src/make_knowledge_packs.jl | 69 ++++++++++----- src/parser.jl | 5 +- src/preparation.jl | 33 ++++--- src/user_preferences.jl | 4 +- src/utils.jl | 72 ++++++++++++---- test/crawl.jl | 7 ++ test/make_knowledge_packs.jl | 8 ++ test/parser.jl | 11 +++ test/runtests.jl | 56 ++---------- test/utils.jl | 10 +++ 23 files changed, 427 insertions(+), 209 deletions(-) delete mode 100644 .vscode/settings.json create mode 100644 src/extract_package_name.jl create mode 100644 test/crawl.jl create mode 100644 test/make_knowledge_packs.jl create mode 100644 test/parser.jl create mode 100644 test/utils.jl diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml index 5657bd0..9601a61 100644 --- a/.JuliaFormatter.toml +++ b/.JuliaFormatter.toml @@ -1,2 +1,3 @@ # See https://domluna.github.io/JuliaFormatter.jl/stable/ for a list of options style = "sciml" +ignore = ["knowledge_packs"] \ No newline at end of file diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 1c00a7a..5cd2adb 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -38,39 +38,39 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses:
julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v4 + # - uses: codecov/codecov-action@v4 + # with: + # files: lcov.info + # token: ${{ secrets.CODECOV_TOKEN }} + # fail_ci_if_error: false + docs: + name: Documentation + runs-on: ubuntu-latest + permissions: + actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created + contents: write + statuses: write + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 with: - files: lcov.info - token: ${{ secrets.CODECOV_TOKEN }} - fail_ci_if_error: false - # docs: - # name: Documentation - # runs-on: ubuntu-latest - # permissions: - # actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created - # contents: write - # statuses: write - # steps: - # - uses: actions/checkout@v4 - # - uses: julia-actions/setup-julia@v2 - # with: - # version: "1" - # - uses: julia-actions/cache@v2 - # - name: Configure doc environment - # shell: julia --project=docs --color=yes {0} - # run: | - # using Pkg - # Pkg.develop(PackageSpec(path=pwd())) - # Pkg.instantiate() - # - uses: julia-actions/julia-buildpkg@v1 - # - uses: julia-actions/julia-docdeploy@v1 - # env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} - # - name: Run doctests - # shell: julia --project=docs --color=yes {0} - # run: | - # using Documenter: DocMeta, doctest - # using DocsScraper - # DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) - # doctest(DocsScraper) + version: "1" + - uses: julia-actions/cache@v2 + - name: Configure doc environment + shell: julia --project=docs --color=yes {0} + run: | + using Pkg + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate() + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-docdeploy@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DOCUMENTER_KEY: 
${{ secrets.DOCUMENTER_KEY }} + - name: Run doctests + shell: julia --project=docs --color=yes {0} + run: | + using Documenter: DocMeta, doctest + using DocsScraper + DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) + doctest(DocsScraper) diff --git a/.gitignore b/.gitignore index 8e2d4ba..4a1c7f4 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,6 @@ knowledge_packs/ Manifest.toml /Manifest.toml /docs/Manifest.toml -/docs/build/ \ No newline at end of file +/docs/build/ +.vscode/** +**/.DS_Store \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 9238ca7..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cSpell.words": [ - "eachmatch", - "postprocess" - ] -} diff --git a/LICENSE b/LICENSE index d7bd022..183f1b7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) Shreyas Agrawal @splendidbug and J S @svilupp +Copyright (c) Shreyas Agrawal @splendidbug and contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Project.toml b/Project.toml index 16502d1..1fb77c2 100644 --- a/Project.toml +++ b/Project.toml @@ -1,40 +1,46 @@ name = "DocsScraper" uuid = "bd71d052-5e08-40cc-a492-eb4e8da4b649" -authors = ["Shreyas Agrawal @splendidbug and J S @svilupp"] +authors = ["Shreyas Agrawal @splendidbug and contributors"] version = "0.1.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PromptingTools = 
"670122d1-24a8-4d70-bfce-740807c42192" SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" -URIParser = "30578b45-9adc-5946-b283-645ec420af67" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] -AbstractTrees = "0.4.5" -EzXML = "1.2.0" -Gumbo = "0.8.2" -HDF5 = "0.17.2" -HTTP = "1.10.4" -Inflate = "0.1.5" -LinearAlgebra = "<0.0.1, 1" -PromptingTools = "0.36.0" -SparseArrays = "<0.0.1, 1" -Tar = "1.10.0" -URIParser = "0.4.1" -URIs = "1.5.1" -SHA = "0.7.0" - +AbstractTrees = "0.4" +Aqua = "0.8" +Dates = "1" +EzXML = "1.2" +Gumbo = "0.8" +HDF5 = "0.17" +HTTP = "1.10" +Inflate = "0.1" +LinearAlgebra = "1" +PromptingTools = "0.48" +SHA = "0.7" +Serialization = "1" +SparseArrays = "1" +Tar = "1" +Test = "1" +URIs = "1.5" +Unicode = "1" +julia = "1.10" +JSON = "0.21" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" diff --git a/docs/Project.toml b/docs/Project.toml index 41b0b18..15c39b1 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,19 +1,15 @@ [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +DocsScraper = "bd71d052-5e08-40cc-a492-eb4e8da4b649" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIParser = "30578b45-9adc-5946-b283-645ec420af67" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" - -[compat] -AbstractTrees = "0.4.5" -Gumbo = "0.8.2" -HTTP = "1.10.4" -PromptingTools = "0.36.0" -URIs = "1.5.1" diff --git 
a/docs/make.jl b/docs/make.jl index a54f0f6..47bd6f5 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,23 +1,24 @@ using DocsScraper using Documenter -DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) +DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive = true) makedocs(; - modules=[DocsScraper], - authors="Shreyas Agrawal @splendidbug and J S @svilupp", - sitename="DocsScraper.jl", - # format=Documenter.HTML(; - # canonical="https://Shreyas Agrawal.github.io/DocsScraper.jl", - # edit_link="master", - # assets=String[], - # ), - pages=[ - "Home" => "index.md", - ], + modules = [DocsScraper], + authors = "Shreyas Agrawal @splendidbug and contributors", + sitename = "DocsScraper.jl", + repo = "https://github.com/splendidbug/DocsScraper.jl/blob/{commit}{path}#{line}", + format = Documenter.HTML(; + repolink = "https://github.com/splendidbug/DocsScraper.jl", + canonical = "https://splendidbug.github.io/DocsScraper.jl", + edit_link = "main", + assets = String[]), + pages = [ + "API Index" => "index.md" + ] ) deploydocs(; - repo="github.com/Shreyas Agrawal/DocsScraper.jl", - devbranch="main", + repo = "github.com/splendidbug/DocsScraper.jl", + devbranch = "main" ) diff --git a/docs/src/index.md b/docs/src/index.md index a6f0129..c30e1af 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,4 +1,8 @@ -# DocsScraper +# Reference -## Documentation +```@index +``` +```@autodocs +Modules = [DocsScraper] +``` \ No newline at end of file diff --git a/src/DocsScraper.jl b/src/DocsScraper.jl index 40bc3ee..7f114d9 100644 --- a/src/DocsScraper.jl +++ b/src/DocsScraper.jl @@ -9,20 +9,23 @@ using LinearAlgebra, Unicode, SparseArrays using HDF5 using Tar using Inflate - using SHA using Serialization, URIs +using Dates +using JSON include("parser.jl") include("crawl.jl") include("extract_urls.jl") include("preparation.jl") +include("extract_package_name.jl") +export get_package_name 
include("make_knowledge_packs.jl") -export make_knowledge_packs, just_generate +export make_knowledge_packs include("user_preferences.jl") include("utils.jl") -export remove_urls_from_index +export remove_urls_from_index, urls_for_metadata -end \ No newline at end of file +end diff --git a/src/crawl.jl b/src/crawl.jl index a8f93c9..c972ef2 100644 --- a/src/crawl.jl +++ b/src/crawl.jl @@ -5,7 +5,7 @@ Parse the robots.txt string and return rules and the URLs on Sitemap """ function parse_robots_txt!(robots_txt::String) - rules = Dict{String,Dict{String,Vector{String}}}() + rules = Dict{String, Dict{String, Vector{String}}}() current_user_agent = "" sitemap_urls = Vector{AbstractString}() @@ -14,7 +14,8 @@ function parse_robots_txt!(robots_txt::String) if startswith(line, "User-agent:") current_user_agent = strip(split(line, ":")[2]) if !haskey(rules, current_user_agent) - rules[current_user_agent] = Dict("Disallow" => Vector{String}(), "Allow" => Vector{String}()) + rules[current_user_agent] = Dict( + "Disallow" => Vector{String}(), "Allow" => Vector{String}()) end elseif startswith(line, "Disallow:") disallow_path = strip(split(line, ":")[2]) @@ -30,12 +31,10 @@ function parse_robots_txt!(robots_txt::String) url = strip(split(line, ":")[2]) push!(sitemap_urls, url) end - end return rules, sitemap_urls end - """ check_robots_txt(user_agent::AbstractString, url::AbstractString) @@ -99,14 +98,12 @@ end Extract the base url """ function get_base_url(url::AbstractString) - parsed_url = URIs.URI(url) base_url = string(parsed_url.scheme, "://", parsed_url.host, parsed_url.port != nothing ? 
"" * string(parsed_url.port) : "", parsed_url.path) return base_url end - """ process_hostname(url::AbstractString) @@ -118,7 +115,6 @@ function process_hostname(url::AbstractString) return hostname end - """ process_hostname(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}}) @@ -128,7 +124,8 @@ Add `url` to its hostname in `hostname_dict` - `url`: URL string - `hostname_dict`: Dict with key being hostname and value being a vector of URLs """ -function process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}}) +function process_hostname!( + url::AbstractString, hostname_dict::Dict{AbstractString, Vector{AbstractString}}) hostname = process_hostname(url) # Add the URL to the dictionary under its hostname @@ -139,17 +136,15 @@ function process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractStri end end - """ crawl(input_urls::Vector{<:AbstractString}) Crawl on the input URLs and return a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs """ function crawl(input_urls::Vector{<:AbstractString}) - url_queue = Vector{AbstractString}(input_urls) visited_url_set = Set{AbstractString}() - hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}() + hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}() sitemap_urls = Vector{AbstractString}() # TODO: Add parallel processing for URLs @@ -174,5 +169,4 @@ function crawl(input_urls::Vector{<:AbstractString}) end return hostname_url_dict, visited_url_set - end diff --git a/src/extract_package_name.jl b/src/extract_package_name.jl new file mode 100644 index 0000000..525cecf --- /dev/null +++ b/src/extract_package_name.jl @@ -0,0 +1,162 @@ +""" + clean_url(url::String) + +Strip URL of any http:// or https:// or www. prefixes +""" +function clean_url(url::String) + # Remove http://, https://, www., or wwws. + cleaned_url = replace(url, r"^https?://(www\d?\.)?"
=> "") + return cleaned_url +end + +""" + base_url_segment(url::String) + +Return the base url and first path segment if all the other checks fail +""" +function base_url_segment(url::String) + # Clean the URL from unwanted prefixes + cleaned_url = clean_url(url) + + # Parse the cleaned URL + uri = URI("https://" * cleaned_url) # Add https:// to ensure correct parsing + + # Extract the base URL (host) + base_url = replace(uri.host, r"^www\." => "") + + # Extract the first path segment + path_segments = split(uri.path, "/"; keepempty = false) + + if !isempty(path_segments) + first_segment = path_segments[1] + return "$base_url/$first_segment" + else + return base_url + end +end + +""" + url_package_name(url::AbstractString) + +Return the text if the URL itself contains the package name with ".jl" or "_jl" suffixes +""" +function url_package_name(url::AbstractString) + if occursin(r"\.jl", url) || occursin(r"_jl", url) + package_name = match(r"[\/]([^\/]+(?:\.jl|_jl))", url) + return package_name.captures[1] + end + return "" +end + +""" + get_base_url(url::AbstractString) + +Extract the base url +""" +function get_base_url(url::AbstractString) + parsed_url = URIs.URI(url) + base_url = string(parsed_url.scheme, "://", parsed_url.host, + parsed_url.port != nothing ? 
":" * string(parsed_url.port) : "", parsed_url.path) + return base_url +end + +""" + nav_bar(url::AbstractString) + +Julia doc websites tend to have the package name under ".docs-package-name" class in the HTML tree +""" +function nav_bar(url::AbstractString) + base_url = get_base_url(url) + fetched_content = HTTP.get(base_url) + parsed = Gumbo.parsehtml(String(fetched_content.body)) + content_candidates = [el + for el in AbstractTrees.PreOrderDFS(parsed.root) + if el isa HTMLElement] + content_by_class = filter( + el -> getattr(el, "class", nothing) in ["docs-package-name"], content_candidates) + if (!isempty(content_by_class)) + parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)]) + heading_hierarchy = Dict{Symbol, Any}() + process_node!(only(content_by_class), heading_hierarchy, parsed_blocks) + package_name = parsed_blocks[2]["text"] + return package_name + end + return "" +end + +""" + text_before_version(url::AbstractString) + +Return text before "stable" or "dev" or any version in URL. 
It is generally observed that doc websites have package names before their versions +""" +function text_before_version(url::AbstractString) + language_prefixes = [ + "/en/", "/es/", "/fr/", "/de/", "/it/", "/pt/", "/ru/", "/zh/", "/ja/", "/ko/"] + contains_prefix = any(occursin(prefix, url) for prefix in language_prefixes) + if contains_prefix + pattern = r"/([^/]+)/([^/]+)/(?:stable|dev|latest|v\d+(\.\d+)*)(?:/|$)" + else + pattern = r"/([^/]+)/(?:stable|dev|latest|v\d+(\.\d+)*)" + end + package_name = match(pattern, url) + if package_name !== nothing + return package_name.captures[1] + end + return "" +end + +""" + docs_in_url(url::AbstractString) + +If the base url is in the form docs.package_name.domain_extension, then return the middle word i.e., package_name +""" +function docs_in_url(url::AbstractString) + cleaned_url = clean_url(url) + + # Parse the cleaned URL + uri = URI("https://" * cleaned_url) # Add https:// to ensure correct parsing + + # Extract the base URL (host) + base_url = replace(uri.host, r"^www\." 
=> "") + pattern = r"docs\.([^.]+)\.(org|com|ai|net|io|co|tech)" + m = match(pattern, base_url) + if m !== nothing + return m.captures[1] + end + return "" +end + +""" + get_package_name(url::AbstractString) + +Return name of the package through the package URL +""" +function get_package_name(url::AbstractString) + + # try 1: look for package name in URL + package_name = url_package_name(url) + if (!isempty(package_name)) + return package_name + end + + # try 2: look for package name in nav bar + package_name = nav_bar(url) + if (!isempty(package_name)) + return package_name + end + + # try 3: if the base url is in the form docs.package_name.domain_extension + package_name = docs_in_url(url) + if (!isempty(package_name)) + return package_name + end + + # try 4: get text before "stable" or "dev" or any version in URL + package_name = text_before_version(url) + if (!isempty(package_name)) + return package_name + end + + # fallback: return base URL with first path segment + return base_url_segment(url) +end diff --git a/src/extract_urls.jl b/src/extract_urls.jl index d5e8fcf..d750f34 100644 --- a/src/extract_urls.jl +++ b/src/extract_urls.jl @@ -141,4 +141,4 @@ function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString}) else find_urls_html!(url, parsed.root, url_queue) end -end \ No newline at end of file +end diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl index 291a9c7..5d56ff8 100644 --- a/src/make_knowledge_packs.jl +++ b/src/make_knowledge_packs.jl @@ -24,8 +24,8 @@ function create_output_folders(knowledge_pack_path::String) end """ - make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; max_chunk_size::Int=MAX_CHUNK_SIZE, - min_chunk_size::Int=MIN_CHUNK_SIZE) + make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; + max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) Parse URLs from hostname_url_dict 
and save the chunks @@ -44,7 +44,8 @@ function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractStri output_sources = Vector{String}() for url in urls try - chunks, sources = process_paths(url; max_chunk_size, min_chunk_size) + chunks, sources = process_paths( + url; max_chunk_size, min_chunk_size) append!(output_chunks, chunks) append!(output_sources, sources) catch @@ -85,16 +86,20 @@ function l2_norm_columns(vect::AbstractVector) end """ - generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE) + generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, + embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString) Deserialize chunks and sources to generate embeddings # Arguments - model: Embedding model - embedding_size: Embedding dimensions +- custom_metadata: Custom metadata like ecosystem name if required """ -function generate_embeddings(knowledge_pack_path::String; model::AbstractString = MODEL, - embedding_size::Int = EMBEDDING_SIZE) +function generate_embeddings( + knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE, + model::AbstractString = MODEL, + embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString) embedder = RT.BatchEmbedder() entries = readdir(knowledge_pack_path) # Initialize a dictionary to group files by hostname and chunk size @@ -114,31 +119,31 @@ function generate_embeddings(knowledge_pack_path::String; model::AbstractString if match_chunks !== nothing hostname = match_chunks.captures[1] - chunk_size = parse(Int, match_chunks.captures[2]) + max_chunk_size = parse(Int, match_chunks.captures[2]) if !haskey(hostname_files, hostname) hostname_files[hostname] = Dict{Int, Dict{String, String}}() end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String, String}() + if !haskey(hostname_files[hostname], max_chunk_size) + hostname_files[hostname][max_chunk_size] = 
Dict{String, String}() end - hostname_files[hostname][chunk_size]["chunks"] = joinpath( + hostname_files[hostname][max_chunk_size]["chunks"] = joinpath( knowledge_pack_path, file) elseif match_sources !== nothing hostname = match_sources.captures[1] - chunk_size = parse(Int, match_sources.captures[2]) + max_chunk_size = parse(Int, match_sources.captures[2]) if !haskey(hostname_files, hostname) hostname_files[hostname] = Dict{Int, Dict{String, String}}() end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String, String}() + if !haskey(hostname_files[hostname], max_chunk_size) + hostname_files[hostname][max_chunk_size] = Dict{String, String}() end - hostname_files[hostname][chunk_size]["sources"] = joinpath( + hostname_files[hostname][max_chunk_size]["sources"] = joinpath( knowledge_pack_path, file) end end # Process each pair of files for (hostname, chunk_files) in hostname_files - for (chunk_size, files) in chunk_files + for (max_chunk_size, files) in chunk_files if haskey(files, "chunks") && haskey(files, "sources") chunks_file = files["chunks"] sources_file = files["sources"] @@ -148,17 +153,31 @@ function generate_embeddings(knowledge_pack_path::String; model::AbstractString full_embeddings = RT.get_embeddings( embedder, chunks; model, verbose = false, cost_tracker) @info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))" + + trunc = embedding_size < EMBEDDING_SIZE ? 
1 : 0 fn_output = joinpath(knowledge_pack_path, "packs", - "$hostname-textembedding3large-0-Float32__v1.0.tar.gz") + "$hostname-$model-$trunc-Float32__v1.0.tar.gz") fn_temp = joinpath(knowledge_pack_path, "packs", - "$hostname-textembedding3large-0-Float32__v1.0.hdf5") + "$hostname-$model-$trunc-Float32__v1.0.hdf5") + h5open(fn_temp, "w") do file file["chunks"] = chunks file["sources"] = sources file["embeddings"] = full_embeddings[1:embedding_size, :] |> l2_norm_columns |> x -> map(>(0), x) file["type"] = "ChunkIndex" - # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl" + + package_url_dict = Dict{String, Vector{String}}() + package_url_dict = urls_for_metadata(sources) + + metadata = Dict( + :embedded_dt => Dates.today(), + :custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size, + :embedding_size => embedding_size, :model => model, + :packages => package_url_dict) + + metadata_json = JSON.json(metadata) + file["metadata"] = metadata_json end command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` @@ -166,7 +185,7 @@ function generate_embeddings(knowledge_pack_path::String; model::AbstractString report_artifact(fn_output) else - @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size" + @warn "Missing pair for hostname: $hostname, max chunk size: $max_chunk_size" end end end @@ -174,7 +193,8 @@ end """ make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[], - max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE) + max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE, + custom_metadata::AbstractString) Entry point to crawl, parse 
and generate embeddings @@ -185,11 +205,12 @@ Entry point to crawl, parse and generate embeddings - min_chunk_size: Minimum chunk size - model: Embedding model - embedding_size: Embedding dimensions +- custom_metadata: Custom metadata like ecosystem name if required """ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[]; single_urls::Vector{<:AbstractString} = String[], max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE, - model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE) + model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "") if isempty(crawlable_urls) && isempty(single_urls) error("At least one of `input_urls` or `single_pages` must be provided.") end @@ -217,6 +238,8 @@ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[ end knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs") create_output_folders(knowledge_pack_path) - make_chunks(hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size) - generate_embeddings(knowledge_pack_path; model, embedding_size) + make_chunks( + hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size) + generate_embeddings( + knowledge_pack_path; max_chunk_size, model, embedding_size, custom_metadata) end diff --git a/src/parser.jl b/src/parser.jl index def1a17..2de7035 100644 --- a/src/parser.jl +++ b/src/parser.jl @@ -340,9 +340,6 @@ end parse_url(url::AbstractString) Initiator and main function to parse HTML from url. 
Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata - -# Arguments -- `url`: URL string to parse """ function parse_url_to_blocks(url::AbstractString) @@ -356,7 +353,7 @@ function parse_url_to_blocks(url::AbstractString) # Getting title of the document # title = [el # for el in AbstractTrees.PreOrderDFS(r_parsed.root) - # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ") + # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ") parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)]) heading_hierarchy = Dict{Symbol, Any}() process_node!(get_html_content(parsed.root), heading_hierarchy, parsed_blocks) diff --git a/src/preparation.jl b/src/preparation.jl index 9979155..8736050 100644 --- a/src/preparation.jl +++ b/src/preparation.jl @@ -10,22 +10,21 @@ get_header_path(d) # Output: "Axis/Attributes/yzoomkey" ``` """ -function get_header_path(d::Dict{String,Any}) - metadata = get(d, "metadata", Dict{Any,Any}()) +function get_header_path(d::Dict{String, Any}) + metadata = get(d, "metadata", Dict{Any, Any}()) isempty(metadata) && return nothing keys_ = [:h1, :h2, :h3] vals = get.(Ref(metadata), keys_, "") |> x -> filter(!isempty, x) |> x -> join(x, "/") isempty(vals) ? 
nothing : vals end - - """ roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="") Roll-up chunks (that have the same header!), so we can split them later by to get the desired length """ -function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="") +function roll_up_chunks(parsed_blocks::Vector{Dict{String, Any}}, + url::AbstractString; separator::String = "") docs = String[] io = IOBuffer() last_header = nothing @@ -57,7 +56,6 @@ function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractSt return docs, sources end - struct DocParserChunker <: RT.AbstractChunker end """ @@ -74,9 +72,9 @@ and splits them by separators to get the desired length. - separators: Chunk separators - max_chunk_size Maximum chunk size """ -function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; - verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE) - +function RT.get_chunks( + chunker::DocParserChunker, url::AbstractString; + verbose::Bool = true, separators = ["\n\n", ". 
", "\n", " "], max_chunk_size::Int = MAX_CHUNK_SIZE) SEP = "" sources = AbstractVector{<:AbstractString} output_chunks = Vector{SubString{String}}() @@ -86,14 +84,14 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; parsed_blocks = parse_url_to_blocks(url) ## Roll up to the same header - docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP) + docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP) ## roll up chunks by SEP splitter, then remove it later for (doc, src) in zip(docs_, sources_) ## roll up chunks by SEP splitter, then remove it later - doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length=max_chunk_size) .|> + doc_chunks = PT.recursive_splitter( + doc, [SEP, separators...]; max_length = max_chunk_size) .|> x -> replace(x, SEP => " ") .|> strip |> x -> filter(!isempty, x) - chunk_lengths = length.(doc_chunks) # skip if no chunks found isempty(doc_chunks) && continue append!(output_chunks, doc_chunks) @@ -102,15 +100,14 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; return output_chunks, output_sources end - - """ process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them. 
""" -function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) - +function process_paths(url::AbstractString; + max_chunk_size::Int = MAX_CHUNK_SIZE, + min_chunk_size::Int = MIN_CHUNK_SIZE) output_chunks = Vector{SubString{String}}() output_sources = Vector{String}() @@ -119,9 +116,9 @@ function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, append!(output_chunks, chunks) append!(output_sources, sources) - @info "Scraping done: $(length(output_chunks)) chunks" - output_chunks, output_sources = postprocess_chunks(output_chunks, output_sources; min_chunk_size, skip_code=true) + output_chunks, output_sources = postprocess_chunks( + output_chunks, output_sources; min_chunk_size, skip_code = true) return output_chunks, output_sources end diff --git a/src/user_preferences.jl b/src/user_preferences.jl index 98794c6..00c1a2f 100644 --- a/src/user_preferences.jl +++ b/src/user_preferences.jl @@ -1,4 +1,4 @@ global MIN_CHUNK_SIZE = 40 -global MAX_CHUNK_SIZE = 256 +global MAX_CHUNK_SIZE = 384 global MODEL = "text-embedding-3-large" -global EMBEDDING_SIZE = 1024 \ No newline at end of file +global EMBEDDING_SIZE = 3072 diff --git a/src/utils.jl b/src/utils.jl index e8dc014..dfbc17c 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -30,21 +30,21 @@ end Remove chunks that are duplicated in the input list of chunks and their corresponding sources. 
""" -function remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}) +function remove_duplicates( + chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}) idxs = find_duplicates(chunks) return chunks[.!idxs], sources[.!idxs] end - """ remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true) Remove chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources. """ -function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; - min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true) - +function remove_short_chunks( + chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int = MIN_CHUNK_SIZE, skip_code::Bool = true) chunk_lengths = length.(chunks) idx = if skip_code ## Keep short chunks if they contain code (might be combined with some preceding/succeeding text) @@ -56,17 +56,15 @@ function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources:: return chunks[idx], sources[idx] end - -function replace_local_paths(sources::AbstractVector{<:AbstractString}, paths::AbstractVector{<:AbstractString}, websites::AbstractVector{<:AbstractString}) - @assert length(paths) == length(websites) "Length of `paths` must match length of `websites`" +function replace_local_paths( + sources::AbstractVector{<:AbstractString}, paths::AbstractVector{<:AbstractString}, + websites::AbstractVector{<:AbstractString}) + @assert length(paths)==length(websites) "Length of `paths` must match length of `websites`" replacement_pairs = paths .=> websites output = map(x -> replace(x, replacement_pairs...), sources) return output end - - - """ function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, 
sources::AbstractVector{<:AbstractString}; min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, @@ -74,9 +72,11 @@ end Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates. """ -function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; - min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, - websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing) +function postprocess_chunks( + chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int = MIN_CHUNK_SIZE, skip_code::Bool = true, + paths::Union{Nothing, AbstractVector{<:AbstractString}} = nothing, + websites::Union{Nothing, AbstractVector{<:AbstractString}} = nothing) len_ = length(chunks) chunks, sources = remove_short_chunks(chunks, sources; min_chunk_size, skip_code) @info "Removed $(len_ - length(chunks)) short chunks" @@ -99,7 +99,8 @@ end Remove chunks and sources corresponding to URLs starting with `prefix_urls` """ -function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString}) +function remove_urls_from_index( + index_path::AbstractString, prefix_urls = Vector{<:AbstractString}) @assert endswith(file_path, ".hdf5") "Provided file path must end with `.hdf5` (see HDF5.jl)." h5open(index_path, "r+") do orig_file @@ -119,4 +120,43 @@ function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{< write(file["chunks"], chunks) write(file["embeddings"], embeddings) end -end \ No newline at end of file +end + +""" + urls_for_metadata(sources::Vector{String}) + +Return a Dict of package names with their associated URLs +Note: Due to their large number, URLs are stripped down to the package name; Package subpaths are not included in metadata. 
+""" +function urls_for_metadata(sources::Vector{String}) + urls = [split(source, " -")[1] for source in sources] + pattern = r"(/(?:stable|dev|latest|v\d+(?:\.\d+)*))" + cleaned_urls = [endswith(String(url), "/") ? String(url)[1:(end - 1)] : String(url) + for url in urls] + unique_urls = unique(cleaned_urls) + package_names = Vector{String}() + + for url in unique_urls + push!(package_names, get_package_name(String(url))) + end + + cleaned_urls = [match(pattern, url) !== nothing ? first(split(url, pattern)) : url + for url in unique_urls] + + zipped = zip(cleaned_urls, package_names) |> collect + unique_pairs = unique(zipped) + unique_urls = [pair[1] for pair in unique_pairs] + unique_package_names = [pair[2] for pair in unique_pairs] + + package_url_dict = Dict{String, Vector{String}}() + for (url, package_name) in zip(unique_urls, unique_package_names) + if haskey(package_url_dict, package_name) + # If the package_name is already a key, append the url to the existing array + push!(package_url_dict[package_name], url) + else + # Otherwise, create a new entry with the package_name and the url + package_url_dict[package_name] = [url] + end + end + return package_url_dict +end diff --git a/test/crawl.jl b/test/crawl.jl new file mode 100644 index 0000000..6b00ca4 --- /dev/null +++ b/test/crawl.jl @@ -0,0 +1,7 @@ +using DocsScraper: crawl + +@testset "crawl" begin + urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"]) + hostname_url_dict = crawl(urls) + @test length(hostname_url_dict) > 0 +end diff --git a/test/make_knowledge_packs.jl b/test/make_knowledge_packs.jl new file mode 100644 index 0000000..5690725 --- /dev/null +++ b/test/make_knowledge_packs.jl @@ -0,0 +1,8 @@ +using DocsScraper: process_paths + +@testset "overall test" begin + url = "https://docs.julialang.org/en/v1/" + chunks, sources = process_paths(url) + @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && + sources[1] != nothing +end diff --git a/test/parser.jl 
b/test/parser.jl new file mode 100644 index 0000000..0faeb04 --- /dev/null +++ b/test/parser.jl @@ -0,0 +1,11 @@ +using DocsScraper: parse_url_to_blocks, roll_up_chunks + +@testset "parse & roll_up" begin + url = "https://docs.julialang.org/en/v1/" + parsed_blocks = parse_url_to_blocks(url) + @test length(parsed_blocks) > 0 + SEP = "" + docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP) + @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && + sources_[1] != nothing +end diff --git a/test/runtests.jl b/test/runtests.jl index 4b4a92c..6e1e7e8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,51 +1,13 @@ - +using DocsScraper using Test -using HTTP, Gumbo, AbstractTrees, URIs -using Gumbo: HTMLDocument, HTMLElement -using EzXML -using PromptingTools -const PT = PromptingTools -const RT = PromptingTools.Experimental.RAGTools -using LinearAlgebra, Unicode, SparseArrays -using HDF5 -using Tar -using Inflate -using SHA -using Serialization, URIs - -include(joinpath("..", "src", "crawl.jl")) -include(joinpath("..", "src", "extract_urls.jl")) -include(joinpath("..", "src", "parser.jl")) -include(joinpath("..", "src", "preparation.jl")) -include(joinpath("..", "src", "user_preferences.jl")) -include(joinpath("..", "src", "utils.jl")) - -urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"]) -url = urls[1] -queue = Vector{AbstractString}() - -@testset "HTTP" begin - @test HTTP.get(url) != nothing - result, sitemap_queue = check_robots_txt("*", url) - @test result == true -end - -@testset "crawl" begin - hostname_url_dict = crawl(urls) - @test length(hostname_url_dict) > 0 -end +using Aqua -@testset "parse & roll_up" begin - parsed_blocks = parse_url_to_blocks(url) - @test length(parsed_blocks) > 0 - SEP = "" - docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP) - @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && - sources_[1] != nothing -end +@testset "DocsScraper.jl" begin 
+ @testset "Code quality (Aqua.jl)" begin + Aqua.test_all(DocsScraper; persistent_tasks = false) + end -@testset "overall test" begin - chunks, sources = process_paths(url) - @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && - sources[1] != nothing + include("crawl.jl") + include("parser.jl") + include("make_knowledge_packs.jl") end diff --git a/test/utils.jl b/test/utils.jl new file mode 100644 index 0000000..fbe338a --- /dev/null +++ b/test/utils.jl @@ -0,0 +1,10 @@ +using DocsScraper: parse_url_to_blocks, roll_up_chunks + +@testset "parse & roll_up" begin + parsed_blocks = parse_url_to_blocks(url) + @test length(parsed_blocks) > 0 + SEP = "" + docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP) + @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && + sources_[1] != nothing +end From 965873abf5b48f7bac6e99036d6b23a79dd54985 Mon Sep 17 00:00:00 2001 From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com> Date: Thu, 15 Aug 2024 19:33:35 -0700 Subject: [PATCH 6/7] create a single index file --- Project.toml | 6 +- src/DocsScraper.jl | 1 + src/make_knowledge_packs.jl | 140 +++++++++++++++++++++--------------- 3 files changed, 87 insertions(+), 60 deletions(-) diff --git a/Project.toml b/Project.toml index 1fb77c2..ef5aaa9 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,7 @@ Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" @@ -30,8 +31,10 @@ Gumbo = "0.8" HDF5 = "0.17" HTTP = "1.10" Inflate = "0.1" +JSON = "0.21" LinearAlgebra = "1" -PromptingTools = "0.48" +PromptingTools = "0.49" +Random = "1" SHA = "0.7" Serialization = "1" 
SparseArrays = "1" @@ -40,7 +43,6 @@ Test = "1" URIs = "1.5" Unicode = "1" julia = "1.10" -JSON = "0.21" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" diff --git a/src/DocsScraper.jl b/src/DocsScraper.jl index 7f114d9..0a65d57 100644 --- a/src/DocsScraper.jl +++ b/src/DocsScraper.jl @@ -13,6 +13,7 @@ using SHA using Serialization, URIs using Dates using JSON +using Random include("parser.jl") include("crawl.jl") diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl index 5d56ff8..a787edf 100644 --- a/src/make_knowledge_packs.jl +++ b/src/make_knowledge_packs.jl @@ -38,7 +38,6 @@ Parse URLs from hostname_url_dict and save the chunks function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}}, knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE) - SAVE_CHUNKS = true for (hostname, urls) in hostname_url_dict output_chunks = Vector{SubString{String}}() output_sources = Vector{String}() @@ -52,16 +51,14 @@ function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractStri @error "error!! 
check url: $url" end end - if SAVE_CHUNKS - serialize( - joinpath(knowledge_pack_path, - "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), - output_chunks) - serialize( - joinpath(knowledge_pack_path, - "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), - output_sources) - end + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_chunks) + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_sources) end end @@ -87,19 +84,24 @@ end """ generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, - embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString) + embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString, + bool_embeddings::Bool = true, index_name::AbstractString = "") Deserialize chunks and sources to generate embeddings +Note: We highly recommend to pass `index_name`. This will be the name of the generated index. Default: date-randomInt # Arguments - model: Embedding model - embedding_size: Embedding dimensions - custom_metadata: Custom metadata like ecosystem name if required +- bool_embeddings: If true, embeddings generated will be boolean, Float32 otherwise +- index_name: Name if the index. 
Default: date-randomInt """ function generate_embeddings( knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE, model::AbstractString = MODEL, - embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString) + embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString, + bool_embeddings::Bool = true, index_name::AbstractString = "") embedder = RT.BatchEmbedder() entries = readdir(knowledge_pack_path) # Initialize a dictionary to group files by hostname and chunk size @@ -109,9 +111,6 @@ function generate_embeddings( chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$" sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$" - # chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$" - # sources_pattern = r"^(.*)-sources-(\d+)\.jls$" - # Group files by hostname and chunk size for file in entries match_chunks = match(chunks_pattern, file) @@ -141,62 +140,83 @@ function generate_embeddings( knowledge_pack_path, file) end end - # Process each pair of files + + chunks = Vector{SubString{String}}() + sources = Vector{String}() + + # Add chunks and sources to vectors from each of the scraped file for (hostname, chunk_files) in hostname_files for (max_chunk_size, files) in chunk_files if haskey(files, "chunks") && haskey(files, "sources") chunks_file = files["chunks"] sources_file = files["sources"] - chunks = deserialize(chunks_file) - sources = deserialize(sources_file) - cost_tracker = Threads.Atomic{Float64}(0.0) - full_embeddings = RT.get_embeddings( - embedder, chunks; model, verbose = false, cost_tracker) - @info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))" - - trunc = embedding_size < EMBEDDING_SIZE ? 
1 : 0 - fn_output = joinpath(knowledge_pack_path, "packs", - "$hostname-$model-$trunc-Float32__v1.0.tar.gz") - fn_temp = joinpath(knowledge_pack_path, "packs", - "$hostname-$model-$trunc-Float32__v1.0.hdf5") - - h5open(fn_temp, "w") do file - file["chunks"] = chunks - file["sources"] = sources - file["embeddings"] = full_embeddings[1:embedding_size, :] |> - l2_norm_columns |> x -> map(>(0), x) - file["type"] = "ChunkIndex" - - package_url_dict = Dict{String, Vector{String}}() - package_url_dict = urls_for_metadata(sources) - - metadata = Dict( - :embedded_dt => Dates.today(), - :custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size, - :embedding_size => embedding_size, :model => model, - :packages => package_url_dict) - - metadata_json = JSON.json(metadata) - file["metadata"] = metadata_json - end - - command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` - run(command) - report_artifact(fn_output) - + append!(chunks, deserialize(chunks_file)) + append!(sources, deserialize(sources_file)) else @warn "Missing pair for hostname: $hostname, max chunk size: $max_chunk_size" end end end + + # Generate embeddings + cost_tracker = Threads.Atomic{Float64}(0.0) + full_embeddings = RT.get_embeddings( + embedder, chunks; model, verbose = false, cost_tracker) + + full_embeddings = full_embeddings[1:embedding_size, :] |> + l2_norm_columns + + if bool_embeddings + full_embeddings = map(>(0), full_embeddings) + end + + if isempty(index_name) + rand_int = rand(1000:100000) + date = Dates.today() + index_name = "$(date)-$(rand_int)" + end + + @info "Created embeddings for $index_name. Cost: \$$(round(cost_tracker[], digits=3))" + + trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0 + emb_data_type = bool_embeddings ? 
"Bool" : "Float32" + + fn_output = joinpath(knowledge_pack_path, "packs", + "$index_name-$model-$trunc-$(emb_data_type)__v1.0.tar.gz") + fn_temp = joinpath(knowledge_pack_path, "packs", + "$index_name-$model-$trunc-$(emb_data_type)__v1.0.hdf5") + + h5open(fn_temp, "w") do file + file["chunks"] = chunks + file["sources"] = sources + file["embeddings"] = full_embeddings + file["type"] = "ChunkIndex" + + package_url_dict = Dict{String, Vector{String}}() + package_url_dict = urls_for_metadata(sources) + + metadata = Dict( + :embedded_dt => Dates.today(), + :custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size, + :embedding_size => embedding_size, :model => model, + :packages => package_url_dict) + + metadata_json = JSON.json(metadata) + file["metadata"] = metadata_json + end + + command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` + run(command) + report_artifact(fn_output) end """ make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[], max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE, - custom_metadata::AbstractString) + custom_metadata::AbstractString, bool_embeddings::Bool = true, index_name::AbstractString = "") -Entry point to crawl, parse and generate embeddings +Entry point to crawl, parse and generate embeddings. +Note: We highly recommend to pass `index_name`. This will be the name of the generated index. Default: date-randomInt # Arguments - crawlable_urls: URLs that should be crawled to find more links @@ -206,11 +226,14 @@ Entry point to crawl, parse and generate embeddings - model: Embedding model - embedding_size: Embedding dimensions - custom_metadata: Custom metadata like ecosystem name if required +- bool_embeddings: If true, embeddings generated will be boolean, Float32 otherwise +- index_name: Name if the index. 
Default: date-randomInt """ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[]; single_urls::Vector{<:AbstractString} = String[], max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE, - model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "") + model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "", + bool_embeddings::Bool = true, index_name::AbstractString = "") if isempty(crawlable_urls) && isempty(single_urls) error("At least one of `input_urls` or `single_pages` must be provided.") end @@ -241,5 +264,6 @@ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[ make_chunks( hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size) generate_embeddings( - knowledge_pack_path; max_chunk_size, model, embedding_size, custom_metadata) + knowledge_pack_path; max_chunk_size, model, embedding_size, + custom_metadata, bool_embeddings, index_name) end From b2c629ab64599da6be4e0f08b46d2e5567fbd8a6 Mon Sep 17 00:00:00 2001 From: Shreyas Shirish Agrawal <48771895+splendidbug@users.noreply.github.com> Date: Thu, 15 Aug 2024 20:19:03 -0700 Subject: [PATCH 7/7] Update Project.toml --- Project.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Project.toml b/Project.toml index 3cbe2e5..ef5aaa9 100644 --- a/Project.toml +++ b/Project.toml @@ -43,7 +43,6 @@ Test = "1" URIs = "1.5" Unicode = "1" julia = "1.10" -JSON = "0.21" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"