tags
+Extract URLs inside HTML or XML files
# Arguments
- url: url from which all other URLs will be extracted
- url_queue: Vector in which extracted URLs will be appended
"""
function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString})
-
@info "Scraping link: $url"
- # println(url)
- # try
fetched_content = HTTP.get(url)
parsed = Gumbo.parsehtml(String(fetched_content.body))
- if (url[end-3:end] == ".xml")
+ if (url[(end - 3):end] == ".xml")
find_urls_xml!(url, url_queue)
else
find_urls_html!(url, parsed.root, url_queue)
end
- # print("-------------")
- # catch e
- # println("Bad URL: $url")
- # end
end
\ No newline at end of file
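
For context, a minimal usage sketch of `get_urls!` above, assuming the rest of `extract_urls.jl` (i.e. `find_urls_html!`/`find_urls_xml!`) is in scope; the seed URL is the one used in the test suite.

```julia
# Collect links reachable from a seed page; discovered URLs are appended to the queue.
url_queue = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
url = popfirst!(url_queue)
get_urls!(url, url_queue)
@info "Discovered $(length(url_queue)) new links from $url"
```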
diff --git a/src/make_embeddings.jl b/src/make_embeddings.jl
deleted file mode 100644
index f51c865..0000000
--- a/src/make_embeddings.jl
+++ /dev/null
@@ -1,173 +0,0 @@
-## TODO: Make a function to Check for version number
-
-"""
- report_artifact()
-
-prints artifact information
-"""
-function report_artifact(fn_output)
- @info("ARTIFACT: $(basename(fn_output))")
- @info("sha256: ", bytes2hex(open(sha256, fn_output)))
- @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output))))
-end
-
-
-
-
-"""
- create_output_folders()
-
-Creates output folders
-"""
-function create_output_folders(knowledge_pack_path::String)
- # Define the folder path
- folder_path = joinpath(knowledge_pack_path, "packs")
- println("folder_path:", folder_path)
- # Check if the folder exists
- if !isdir(folder_path)
- mkpath(folder_path)
- @info "Folder created: $folder_path"
- else
- @info "Folder already exists: $folder_path"
- end
-
-end
-
-"""
- make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}})
-
-Parses URLs from hostname_url_dict and saves the chunks
-
-# Arguments
-- hostname_url_dict: Dict with key being hostname and value being a vector of URLs
-"""
-function make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String)
- output_chunks = Vector{SubString{String}}()
- output_sources = Vector{String}()
- SAVE_CHUNKS = true
- CHUNK_SIZE = 512
- for (hostname, urls) in hostname_url_dict
- for url in urls
- try
- chunks, sources = process_paths(url)
- append!(output_chunks, chunks)
- append!(output_sources, sources)
- catch
- @error "error!! check url: $url"
- end
- end
- if SAVE_CHUNKS
- serialize(joinpath(knowledge_pack_path, "$(hostname)-chunks-$(CHUNK_SIZE).jls"), output_chunks)
- serialize(joinpath(knowledge_pack_path, "$(hostname)-sources-$(CHUNK_SIZE).jls"), output_sources)
- end
-
- end
-
-
-end
-
-function l2_norm_columns(mat::AbstractMatrix)
- norm_ = norm.(eachcol(mat))
- return mat ./ norm_'
-end
-function l2_norm_columns(vect::AbstractVector)
- norm_ = norm(vect)
- return vect / norm_
-end
-
-
-"""
- generate_embeddings()
-
-Deserializes chunks and sources to generate embeddings
-"""
-function generate_embeddings(knowledge_pack_path::String)
- embedder = RT.BatchEmbedder()
- entries = readdir(knowledge_pack_path)
-
- # Initialize a dictionary to group files by hostname and chunk size
- hostname_files = Dict{String,Dict{Int,Dict{String,String}}}()
-
- # Regular expressions to match the file patterns
- chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$"
- sources_pattern = r"^(.*)-sources-(\d+)\.jls$"
-
- # Group files by hostname and chunk size
- for file in entries
- match_chunks = match(chunks_pattern, file)
- match_sources = match(sources_pattern, file)
-
- if match_chunks !== nothing
- hostname = match_chunks.captures[1]
- chunk_size = parse(Int, match_chunks.captures[2])
- if !haskey(hostname_files, hostname)
- hostname_files[hostname] = Dict{Int,Dict{String,String}}()
- end
- if !haskey(hostname_files[hostname], chunk_size)
- hostname_files[hostname][chunk_size] = Dict{String,String}()
- end
- hostname_files[hostname][chunk_size]["chunks"] = joinpath(knowledge_pack_path, file)
- elseif match_sources !== nothing
- hostname = match_sources.captures[1]
- chunk_size = parse(Int, match_sources.captures[2])
- if !haskey(hostname_files, hostname)
- hostname_files[hostname] = Dict{Int,Dict{String,String}}()
- end
- if !haskey(hostname_files[hostname], chunk_size)
- hostname_files[hostname][chunk_size] = Dict{String,String}()
- end
- hostname_files[hostname][chunk_size]["sources"] = joinpath(knowledge_pack_path, file)
- end
- end
-
-
- # Process each pair of files
- for (hostname, chunk_files) in hostname_files
- for (chunk_size, files) in chunk_files
- if haskey(files, "chunks") && haskey(files, "sources")
- chunks_file = files["chunks"]
- sources_file = files["sources"]
- chunks = deserialize(chunks_file)
- sources = deserialize(sources_file)
- cost_tracker = Threads.Atomic{Float64}(0.0)
- full_embeddings = RT.get_embeddings(embedder, chunks; model="text-embedding-3-large", verbose=false, cost_tracker, dimensions=1024)
-
- fn_output = joinpath(knowledge_pack_path, "packs", "$hostname-textembedding3large-0-Float32__v1.0.tar.gz")
- fn_temp = joinpath(knowledge_pack_path, "packs", "pack.hdf5")
- h5open(fn_temp, "w") do file
- file["chunks"] = chunks
- file["sources"] = sources
- file["embeddings"] = full_embeddings[1:1024, :] |> l2_norm_columns |> x -> map(>(0), x)
- file["type"] = "ChunkIndex"
- # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl"
- end
- command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
- run(command)
- report_artifact(fn_output)
-
- else
- @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size"
- end
- end
- end
-
-end
-
-
-
-"""
- make_embeddings(input_urls::Vector{<:AbstractString})
-
-Entry point to crawl, parse and create embeddings
-
-# Arguments
-- input_urls: vector containing URL strings to parse
-"""
-function make_embeddings(input_urls::Vector{<:AbstractString})
- hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}()
- hostname_url_dict = crawl(input_urls)
- knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs")
- create_output_folders(knowledge_pack_path)
- make_chunks(hostname_url_dict, knowledge_pack_path)
- generate_embeddings(knowledge_pack_path)
-end
\ No newline at end of file
diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl
new file mode 100644
index 0000000..291a9c7
--- /dev/null
+++ b/src/make_knowledge_packs.jl
@@ -0,0 +1,222 @@
+"""
+ report_artifact(fn_output)
+
+Print artifact information
+"""
+function report_artifact(fn_output)
+ @info("ARTIFACT: $(basename(fn_output))")
+ @info("sha256: ", bytes2hex(open(sha256, fn_output)))
+ @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output))))
+end
+
+"""
+ create_output_folders(knowledge_pack_path::String)
+
+Create output folders under `knowledge_pack_path`
+"""
+function create_output_folders(knowledge_pack_path::String)
+ # Define the folder path
+ folder_path = joinpath(knowledge_pack_path, "packs")
+ # Check if the folder exists
+ if !isdir(folder_path)
+ mkpath(folder_path)
+ end
+end
+
+"""
+ make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; max_chunk_size::Int=MAX_CHUNK_SIZE,
+ min_chunk_size::Int=MIN_CHUNK_SIZE)
+
+Parse URLs from hostname_url_dict and save the chunks
+
+# Arguments
+- hostname_url_dict: Dict with key being hostname and value being a vector of URLs
+- knowledge_pack_path: Knowledge pack path
+- max_chunk_size: Maximum chunk size
+- min_chunk_size: Minimum chunk size
+"""
+function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}},
+ knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
+ min_chunk_size::Int = MIN_CHUNK_SIZE)
+ SAVE_CHUNKS = true
+ for (hostname, urls) in hostname_url_dict
+ output_chunks = Vector{SubString{String}}()
+ output_sources = Vector{String}()
+ for url in urls
+ try
+ chunks, sources = process_paths(url; max_chunk_size, min_chunk_size)
+ append!(output_chunks, chunks)
+ append!(output_sources, sources)
+ catch
+                @error "Error processing URL: $url"
+ end
+ end
+ if SAVE_CHUNKS
+ serialize(
+ joinpath(knowledge_pack_path,
+ "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
+ output_chunks)
+ serialize(
+ joinpath(knowledge_pack_path,
+ "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
+ output_sources)
+ end
+ end
+end
+
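For reference, a sketch of loading the serialized output back with `Serialization`; the hostname is illustrative and the sizes are the defaults from `user_preferences.jl`, only the filename pattern comes from `make_chunks` above.

```julia
using Serialization

hostname = "docs.julialang.org"        # illustrative; depends on the crawled site
chunks = deserialize(joinpath("knowledge_packs", "$(hostname)-chunks-max-256-min-40.jls"))
sources = deserialize(joinpath("knowledge_packs", "$(hostname)-sources-max-256-min-40.jls"))
@assert length(chunks) == length(sources)    # one source string per chunk
```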
+"""
+ l2_norm_columns(mat::AbstractMatrix)
+
+Normalize the columns of the input embeddings
+"""
+function l2_norm_columns(mat::AbstractMatrix)
+ norm_ = norm.(eachcol(mat))
+ return mat ./ norm_'
+end
+
+"""
+ l2_norm_columns(vect::AbstractVector)
+
+Normalize the columns of the input embeddings
+"""
+function l2_norm_columns(vect::AbstractVector)
+ norm_ = norm(vect)
+ return vect / norm_
+end
+
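A small self-contained check of the two normalization helpers above:

```julia
using LinearAlgebra

M = [3.0 0.0;
     4.0 5.0]
M_unit = l2_norm_columns(M)              # scale every column to unit L2 norm
@assert all(c -> isapprox(norm(c), 1.0), eachcol(M_unit))

l2_norm_columns([3.0, 4.0])              # returns [0.6, 0.8]
```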
+"""
+ generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE)
+
+Deserialize chunks and sources to generate embeddings
+
+# Arguments
+- model: Embedding model
+- embedding_size: Embedding dimensions
+"""
+function generate_embeddings(knowledge_pack_path::String; model::AbstractString = MODEL,
+ embedding_size::Int = EMBEDDING_SIZE)
+ embedder = RT.BatchEmbedder()
+ entries = readdir(knowledge_pack_path)
+ # Initialize a dictionary to group files by hostname and chunk size
+ hostname_files = Dict{String, Dict{Int, Dict{String, String}}}()
+
+ # Regular expressions to match the file patterns of chunks and sources
+ chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$"
+ sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$"
+
+ # chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$"
+ # sources_pattern = r"^(.*)-sources-(\d+)\.jls$"
+
+ # Group files by hostname and chunk size
+ for file in entries
+ match_chunks = match(chunks_pattern, file)
+ match_sources = match(sources_pattern, file)
+
+ if match_chunks !== nothing
+ hostname = match_chunks.captures[1]
+ chunk_size = parse(Int, match_chunks.captures[2])
+ if !haskey(hostname_files, hostname)
+ hostname_files[hostname] = Dict{Int, Dict{String, String}}()
+ end
+ if !haskey(hostname_files[hostname], chunk_size)
+ hostname_files[hostname][chunk_size] = Dict{String, String}()
+ end
+ hostname_files[hostname][chunk_size]["chunks"] = joinpath(
+ knowledge_pack_path, file)
+ elseif match_sources !== nothing
+ hostname = match_sources.captures[1]
+ chunk_size = parse(Int, match_sources.captures[2])
+ if !haskey(hostname_files, hostname)
+ hostname_files[hostname] = Dict{Int, Dict{String, String}}()
+ end
+ if !haskey(hostname_files[hostname], chunk_size)
+ hostname_files[hostname][chunk_size] = Dict{String, String}()
+ end
+ hostname_files[hostname][chunk_size]["sources"] = joinpath(
+ knowledge_pack_path, file)
+ end
+ end
+ # Process each pair of files
+ for (hostname, chunk_files) in hostname_files
+ for (chunk_size, files) in chunk_files
+ if haskey(files, "chunks") && haskey(files, "sources")
+ chunks_file = files["chunks"]
+ sources_file = files["sources"]
+ chunks = deserialize(chunks_file)
+ sources = deserialize(sources_file)
+ cost_tracker = Threads.Atomic{Float64}(0.0)
+ full_embeddings = RT.get_embeddings(
+ embedder, chunks; model, verbose = false, cost_tracker)
+ @info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))"
+ fn_output = joinpath(knowledge_pack_path, "packs",
+ "$hostname-textembedding3large-0-Float32__v1.0.tar.gz")
+ fn_temp = joinpath(knowledge_pack_path, "packs",
+ "$hostname-textembedding3large-0-Float32__v1.0.hdf5")
+ h5open(fn_temp, "w") do file
+ file["chunks"] = chunks
+ file["sources"] = sources
+ file["embeddings"] = full_embeddings[1:embedding_size, :] |>
+ l2_norm_columns |> x -> map(>(0), x)
+ file["type"] = "ChunkIndex"
+ # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl"
+ end
+
+ command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
+ run(command)
+ report_artifact(fn_output)
+
+ else
+ @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size"
+ end
+ end
+ end
+end
+
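A sketch of inspecting a generated pack with HDF5.jl; the filename is illustrative, but the dataset keys (`chunks`, `sources`, `embeddings`) are the ones written above.

```julia
using HDF5

fn = joinpath("knowledge_packs", "packs",
    "docs.julialang.org-textembedding3large-0-Float32__v1.0.hdf5")   # illustrative name
h5open(fn, "r") do f
    chunks = read(f["chunks"])
    sources = read(f["sources"])
    embeddings = read(f["embeddings"])    # one column per chunk, binarized via map(>(0), ...)
    @info "Loaded $(length(chunks)) chunks; embedding size $(size(embeddings, 1))"
end
```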
+"""
+ make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],
+ max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE)
+
+Entry point to crawl, parse and generate embeddings
+
+# Arguments
+- crawlable_urls: URLs that should be crawled to find more links
+- single_urls: Single page URLs that should just be scraped and parsed. The crawler won't look for more URLs
+- max_chunk_size: Maximum chunk size
+- min_chunk_size: Minimum chunk size
+- model: Embedding model
+- embedding_size: Embedding dimensions
+"""
+function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[];
+ single_urls::Vector{<:AbstractString} = String[],
+ max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE,
+ model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE)
+ if isempty(crawlable_urls) && isempty(single_urls)
+ error("At least one of `input_urls` or `single_pages` must be provided.")
+ end
+
+ hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}()
+
+ if !isempty(crawlable_urls)
+ hostname_url_dict, visited_url_set = crawl(crawlable_urls)
+ else
+ visited_url_set = Set{AbstractString}()
+ end
+ for url in single_urls
+ base_url = get_base_url(url)
+ if !in(base_url, visited_url_set)
+ push!(visited_url_set, base_url)
+ crawlable, sitemap_urls = check_robots_txt("*", base_url)
+ if crawlable
+ try
+ process_hostname!(url, hostname_url_dict)
+ catch
+ @error "Bad URL: $base_url"
+ end
+ end
+ end
+ end
+ knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs")
+ create_output_folders(knowledge_pack_path)
+ make_chunks(hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size)
+ generate_embeddings(knowledge_pack_path; model, embedding_size)
+end
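A usage sketch for the entry point; the crawlable URL comes from the test suite, the single-page URL is illustrative, and the chunk sizes match the defaults in `user_preferences.jl`.

```julia
using DocsScraper

make_knowledge_packs(["https://docs.julialang.org/en/v1/"];
    single_urls = ["https://docs.julialang.org/en/v1/base/multi-threading/"],
    max_chunk_size = 256, min_chunk_size = 40)
# Chunks/sources (.jls) and the packed index (.hdf5 + .tar.gz) land under knowledge_packs/.
```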
diff --git a/src/parser.jl b/src/parser.jl
index d909280..def1a17 100644
--- a/src/parser.jl
+++ b/src/parser.jl
@@ -1,21 +1,3 @@
-"""
-Working:
-
-Since HTML structure is complex, we need to figure out when do we insert the extracted text in parsed_blocks
-ie., should we add the text of child hierarchy and then insert or should we insert now and let the child hierarchy make another insertion.
-For this we employ multiple checks. If the current node is heading, directly insert into parsed_blocks.
-If the current node is a code block, return the text inside code block with backticks.
-If the node is neither heading nor code, then we'll need to go deeper in the hierarchy.
-if the current node's tag is from the list [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td]
-it is assumed that everything inside the tag is part of a single text block with inline code.
-But when we go deeper and if there is a code block with size > 50 chars, then our assumption was false.
-To correct this, we first insert the previously extracted text, next we insert the current code and additionally indicate the parent recursion iteration
-that the current iteration has inserted the previously parsed text, so there is no need for parent iteration to insert the text block again.
-We indicate this by a return flag is_text_inserted
-"""
-
-
-
"""
insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
parsed_blocks::Vector{Dict{String,Any}},
@@ -30,11 +12,10 @@ Insert the text into parsed_blocks Vector
- text_to_insert: Text to be inserted
- text_type: The text to be inserted could be heading or a code block or just text
"""
-function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- text_to_insert::AbstractString,
- text_type::AbstractString)
-
+function insert_parsed_data!(heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ text_to_insert::AbstractString,
+ text_type::AbstractString)
if !isempty(strip(text_to_insert))
push!(parsed_blocks,
Dict(text_type => strip(text_to_insert),
@@ -42,8 +23,6 @@ function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
end
end
-
-
"""
process_headings!(node::Gumbo.HTMLElement,
heading_hierarchy::Dict{Symbol,Any},
@@ -57,13 +36,13 @@ Process headings. If the current node is heading, directly insert into parsed_bl
- parsed_blocks: Vector of Dicts to store parsed text and metadata
"""
function process_headings!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}})
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}})
tag_name = Gumbo.tag(node)
# Clear headings of equal or lower level
for k in collect(keys(heading_hierarchy))
- if k != "header" && Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name)))
+ if k != "header" &&
+ Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name)))
delete!(heading_hierarchy, k)
end
end
@@ -123,11 +102,10 @@ If the node is neither heading nor code
- prev_text_buffer: IO Buffer which contains previous text
"""
function process_generic_node!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- child_new::Bool=true,
- prev_text_buffer::IO=IOBuffer(write=true))
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ child_new::Bool = true,
+ prev_text_buffer::IO = IOBuffer(write = true))
seekstart(prev_text_buffer)
prev_text = read(prev_text_buffer, String)
@@ -142,10 +120,15 @@ function process_generic_node!(node::Gumbo.HTMLElement,
# if the current tag belongs in the list, it is assumed that all the text/code should be part of a single paragraph/block, unless,
# there occurs a code block with >50 chars, then, previously parsed text is inserted first, then the code block is inserted.
- if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td, :a, :span, :header]
- received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, false, prev_text_buffer)
+ if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i,
+ :cite, :address, :em, :td, :a, :span, :header]
+ received_text, is_code_block, is_text_inserted = process_node!(
+ child, heading_hierarchy, parsed_blocks, false, prev_text_buffer)
+ elseif tag_name in [:script]
+ continue
else
- received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+ received_text, is_code_block, is_text_inserted = process_node!(
+ child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
end
# changing text_to_insert to "" to avoid inserting text_to_insert again (as it was inserted by the child recursion call)
@@ -180,7 +163,6 @@ function process_generic_node!(node::Gumbo.HTMLElement,
print(prev_text_buffer, " " * received_text)
text_to_insert = text_to_insert * " " * received_text
end
-
end
# if child_new is false, this means new child (new entry in parsed_blocks) should not be created, hence,
@@ -195,7 +177,8 @@ function process_generic_node!(node::Gumbo.HTMLElement,
# if we're insert text in current node level, then we should insert the previous text if available,
# otherwise it'll be inserted when the control goes back to the parent call and hence, order of the insertion will be weird
if !isempty(strip(text_to_insert))
- insert_parsed_data!(heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text")
+ insert_parsed_data!(
+ heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text")
is_text_inserted = true
end
@@ -205,7 +188,6 @@ function process_generic_node!(node::Gumbo.HTMLElement,
return "", is_code_block, is_text_inserted
end
-
"""
process_docstring!(node::Gumbo.HTMLElement,
heading_hierarchy::Dict{Symbol,Any},
@@ -224,11 +206,10 @@ Function to process node of class `docstring`
- prev_text_buffer: IO Buffer which contains previous text
"""
function process_docstring!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- child_new::Bool=true,
- prev_text_buffer::IO=IOBuffer(write=true))
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ child_new::Bool = true,
+ prev_text_buffer::IO = IOBuffer(write = true))
seekstart(prev_text_buffer)
prev_text = read(prev_text_buffer, String)
is_code_block = false
@@ -248,10 +229,12 @@ function process_docstring!(node::Gumbo.HTMLElement,
# Insert "header"
if Gumbo.tag(children[1]) == :header
heading_hierarchy[:docstring_header] = strip(Gumbo.text(children[1]))
- insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header")
+ insert_parsed_data!(
+ heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header")
end
- received_text, is_code_block, is_text_inserted = process_node!(children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+ received_text, is_code_block, is_text_inserted = process_node!(
+ children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
if !isempty(strip(received_text))
insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "text")
@@ -279,11 +262,10 @@ Function to process a node
- prev_text_buffer: IO Buffer which contains previous text
"""
function process_node!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- child_new::Bool=true,
- prev_text_buffer::IO=IOBuffer(write=true))
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ child_new::Bool = true,
+ prev_text_buffer::IO = IOBuffer(write = true))
tag_name = Gumbo.tag(node)
if startswith(string(tag_name), "h") && isdigit(last(string(tag_name)))
return process_headings!(node, heading_hierarchy, parsed_blocks)
@@ -292,15 +274,14 @@ function process_node!(node::Gumbo.HTMLElement,
return process_code(node)
elseif tag_name == :article && getattr(node, "class", "") == "docstring"
- return process_docstring!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
-
+ return process_docstring!(
+ node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
end
- return process_generic_node!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
-
+ return process_generic_node!(
+ node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
end
-
"""
multiple dispatch for process_node!() when node is of type Gumbo.HTMLText
"""
@@ -310,14 +291,10 @@ function process_node!(node::Gumbo.HTMLText, args...)
return strip(Gumbo.text(node)), is_code_block, is_text_inserted
end
-
"""
get_base_url(url::AbstractString)
-Extracts the base url.
-
-# Arguments
-- `url`: The url string of which, the base url needs to be extracted
+Extract the base url.
"""
function get_base_url(url::AbstractString)
parsed_url = URIs.URI(url)
@@ -329,7 +306,7 @@ end
"""
get_html_content(root::Gumbo.HTMLElement)
-Returns the main content of the HTML. If not found, returns the whole HTML to parse
+Return the main content of the HTML. If not found, return the whole HTML to parse
# Arguments
- `root`: The HTML root from which content is extracted
@@ -338,73 +315,34 @@ function get_html_content(root::Gumbo.HTMLElement)
target_ids = Set(["VPContent", "main_content_wrap", "pages-content"])
target_classes = Set(["content", "franklin-content"])
- content_candidates = [el for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement]
+ content_candidates = [el
+ for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement]
# First try to find by ID
- content_by_id = filter(el -> getattr(el, "id", nothing) in target_ids, content_candidates)
+ content_by_id = filter(
+ el -> getattr(el, "id", nothing) in target_ids, content_candidates)
if !isempty(content_by_id)
return only(content_by_id)
end
# Fallback to class if no ID matches
- content_by_class = filter(el -> getattr(el, "class", nothing) in target_classes, content_candidates)
+ content_by_class = filter(
+ el -> getattr(el, "class", nothing) in target_classes, content_candidates)
if !isempty(content_by_class)
return only(content_by_class)
end
# Fallback to the root node if no class matches
return root
-
end
-
"""
parse_url(url::AbstractString)
-Initiator and main function to parse HTML from url
+Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata
# Arguments
- `url`: URL string to parse
-
-# Returns
-- A Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata
-
-# Usage
-parsed_blocks = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/")
-
-# Example
-Let the HTML be:
-
-
-
-
- Heading 1
- Heading 2
- para 1
- Heading 3
- this is my code block
- This is another h3 under Heading 2
- This is a paragraph with inline code
-
- Heading 2_2
- para ewg
-
-
-
-
-Output:
-Any[
- Dict{String, Any}("URL" => "URL")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3")
- Dict{String, Any}("code" => "```julia this is my code block```", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"))
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg")
-]
"""
function parse_url_to_blocks(url::AbstractString)
@@ -419,8 +357,8 @@ function parse_url_to_blocks(url::AbstractString)
# title = [el
# for el in AbstractTrees.PreOrderDFS(r_parsed.root)
# if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")
- parsed_blocks = Vector{Dict{String,Any}}([Dict("Source" => base_url)])
- heading_hierarchy = Dict{Symbol,Any}()
+ parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)])
+ heading_hierarchy = Dict{Symbol, Any}()
process_node!(get_html_content(parsed.root), heading_hierarchy, parsed_blocks)
return parsed_blocks
catch
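
A brief usage sketch (the example URL is the one from the docstring removed above):

```julia
parsed_blocks = parse_url_to_blocks("https://docs.julialang.org/en/v1/base/multi-threading/")
# The first entry records the "Source" URL; each following Dict holds one of
# "heading"/"text"/"code" plus a "metadata" Dict with the enclosing h1/h2/h3 headers.
```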
diff --git a/src/preparation.jl b/src/preparation.jl
index ab8d7b5..9979155 100644
--- a/src/preparation.jl
+++ b/src/preparation.jl
@@ -1,9 +1,7 @@
-# include("recursive_splitter.jl")
-include("utils.jl")
"""
get_header_path(d::Dict)
-Concatenates the h1, h2, h3 keys from the metadata of a Dict
+Concatenate the h1, h2, h3 keys from the metadata of a Dict
# Examples
```julia
@@ -12,7 +10,7 @@ get_header_path(d)
# Output: "Axis/Attributes/yzoomkey"
```
"""
-function get_header_path(d::Dict)
+function get_header_path(d::Dict{String,Any})
metadata = get(d, "metadata", Dict{Any,Any}())
isempty(metadata) && return nothing
keys_ = [:h1, :h2, :h3]
@@ -21,8 +19,13 @@ function get_header_path(d::Dict)
end
-"Roll-up chunks (that have the same header!), so we can split them later by to get the desired length"
-function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="")
+
+"""
+ roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="")
+
+Roll-up chunks (that have the same header!), so we can split them later by to get the desired length
+"""
+function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="")
docs = String[]
io = IOBuffer()
last_header = nothing
@@ -35,7 +38,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="<SEP>")
str = String(take!(io))
if !isempty(str)
push!(docs, str)
- src = url * (isnothing(last_header) ? "" : "::$last_header")
+ src = url * (isnothing(last_header) ? "" : " - $last_header")
push!(sources, src)
end
last_header = header
@@ -48,7 +51,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="<SEP>")
str = String(take!(io))
if !isempty(str)
push!(docs, str)
- src = url * (isnothing(last_header) ? "" : "::$last_header")
+ src = url * (isnothing(last_header) ? "" : " - $last_header")
push!(sources, src)
end
return docs, sources
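
Usage mirrors the test suite, assuming the `<SEP>` sentinel as separator:

```julia
SEP = "<SEP>"
url = "https://docs.julialang.org/en/v1/"
parsed_blocks = parse_url_to_blocks(url)
docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
# Each source is the page URL plus " - <header path>" whenever a header was present.
```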
@@ -56,19 +59,23 @@ end
struct DocParserChunker <: RT.AbstractChunker end
-"""
- RT.get_chunks(chunker::DocParserChunker,
- html_files::Vector{<:AbstractString};
- sources::AbstractVector{<:AbstractString}=html_files,
- verbose::Bool=true,
- separators=["\n\n", ". ", "\n", " "], max_length::Int=256)
-Extracts chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, and splits them by separators to get the desired length.
+"""
+ RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
+ verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE)
+
+Extract chunks from the HTML page by parsing its content, rolling up chunks by headers,
+and splitting them by separators to get the desired length.
+
+# Arguments
+- chunker: DocParserChunker
+- url: URL of the webpage to extract chunks from
+- verbose: Bool to print the log
+- separators: Chunk separators
+- max_chunk_size: Maximum chunk size
"""
function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
- verbose::Bool=true,
- separators=["\n\n", ". ", "\n", " "], max_length::Int=256)
-
+ verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE)
    SEP = "<SEP>"
sources = AbstractVector{<:AbstractString}
@@ -84,8 +91,9 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
## roll up chunks by SEP splitter, then remove it later
for (doc, src) in zip(docs_, sources_)
## roll up chunks by SEP splitter, then remove it later
- doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length) .|>
+ doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length=max_chunk_size) .|>
x -> replace(x, SEP => " ") .|> strip |> x -> filter(!isempty, x)
+ chunk_lengths = length.(doc_chunks)
# skip if no chunks found
isempty(doc_chunks) && continue
append!(output_chunks, doc_chunks)
@@ -96,20 +104,24 @@ end
-"Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them."
-function process_paths(url::AbstractString, max_length::Int=512)
+"""
+ process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
+
+Scrape the page at `url`, split it into chunks of at most `max_chunk_size`, and postprocess them (drop short chunks and duplicates).
+"""
+function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
output_chunks = Vector{SubString{String}}()
output_sources = Vector{String}()
- chunks, sources = RT.get_chunks(DocParserChunker(), url; max_length)
+ chunks, sources = RT.get_chunks(DocParserChunker(), url; max_chunk_size)
append!(output_chunks, chunks)
append!(output_sources, sources)
@info "Scraping done: $(length(output_chunks)) chunks"
- postprocess_chunks(output_chunks, output_sources; min_length=40, skip_code=true)
+ output_chunks, output_sources = postprocess_chunks(output_chunks, output_sources; min_chunk_size, skip_code=true)
return output_chunks, output_sources
end
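
End-to-end usage for a single page, as exercised in the tests (the chunk sizes shown are the package defaults):

```julia
url = "https://docs.julialang.org/en/v1/"
chunks, sources = process_paths(url; max_chunk_size = 256, min_chunk_size = 40)
@info "Got $(length(chunks)) postprocessed chunks from $url"
```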
diff --git a/src/user_preferences.jl b/src/user_preferences.jl
new file mode 100644
index 0000000..98794c6
--- /dev/null
+++ b/src/user_preferences.jl
@@ -0,0 +1,4 @@
+global MIN_CHUNK_SIZE = 40
+global MAX_CHUNK_SIZE = 256
+global MODEL = "text-embedding-3-large"
+global EMBEDDING_SIZE = 1024
\ No newline at end of file
diff --git a/src/utils.jl b/src/utils.jl
index 4bf1e07..e8dc014 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,4 +1,9 @@
-"Finds duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, where `true` indicates a duplicate (second instance of the same text)."
+"""
+ find_duplicates(chunks::AbstractVector{<:AbstractString})
+
+Find duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list,
+where `true` indicates a duplicate (second instance of the same text).
+"""
function find_duplicates(chunks::AbstractVector{<:AbstractString})
# hash the chunks for easier search
hashed_chunks = bytes2hex.(sha256.(chunks))
@@ -20,20 +25,34 @@ function find_duplicates(chunks::AbstractVector{<:AbstractString})
return duplicates
end
-"Removes chunks that are duplicated in the input list of chunks and their corresponding sources."
+"""
+ remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
+
+Remove chunks that are duplicated in the input list of chunks and their corresponding sources.
+"""
function remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
idxs = find_duplicates(chunks)
return chunks[.!idxs], sources[.!idxs]
end
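
A quick self-contained check of the deduplication helpers on toy data:

```julia
chunks = ["alpha", "beta", "alpha"]
sources = ["url1", "url2", "url3"]
find_duplicates(chunks)               # flags the second "alpha" as a duplicate
remove_duplicates(chunks, sources)    # (["alpha", "beta"], ["url1", "url2"])
```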
-"Removes chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources."
-function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true)
+
+"""
+ remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
+
+Remove chunks that are shorter than a specified length (`min_chunk_size`) from the input list of chunks and their corresponding sources.
+"""
+function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
+
+ chunk_lengths = length.(chunks)
idx = if skip_code
- ## Keep short chunks if they contain code (might be combined with some preceding/suceeeding text)
- findall(x -> length(x) >= min_length || occursin("```", x), chunks)
+ ## Keep short chunks if they contain code (might be combined with some preceding/succeeding text)
+ findall(x -> length(x) >= min_chunk_size || occursin("```", x), chunks)
else
- findall(x -> length(x) >= min_length, chunks)
+ findall(x -> length(x) >= min_chunk_size, chunks)
end
+ chunk_lengths = length.(chunks[idx])
return chunks[idx], sources[idx]
end
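
And a sketch of the short-chunk filter: chunks containing a code fence are kept even when short.

```julia
code_chunk = "`"^3 * "julia\nf(x) = x\n" * "`"^3     # a short chunk that contains a code fence
chunks = ["tiny", code_chunk,
    "a chunk that is comfortably longer than forty characters overall"]
sources = ["s1", "s2", "s3"]
remove_short_chunks(chunks, sources; min_chunk_size = 40, skip_code = true)
# keeps the code chunk (despite its length) and the long chunk, drops "tiny"
```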
@@ -42,14 +61,24 @@ function replace_local_paths(sources::AbstractVector{<:AbstractString}, paths::A
@assert length(paths) == length(websites) "Length of `paths` must match length of `websites`"
replacement_pairs = paths .=> websites
output = map(x -> replace(x, replacement_pairs...), sources)
+ return output
end
-"Post-processes the input list of chunks and their corresponding sources by removing short chunks and duplicates."
-function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true,
- paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
+
+
+"""
+    postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
+ websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
+
+Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates.
+"""
+function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
+ websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
len_ = length(chunks)
- chunks, sources = remove_short_chunks(chunks, sources; min_length, skip_code)
+ chunks, sources = remove_short_chunks(chunks, sources; min_chunk_size, skip_code)
@info "Removed $(len_ - length(chunks)) short chunks"
len_ = length(chunks)
@@ -63,6 +92,31 @@ function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::A
end
return chunks, sources
+end
+
+"""
+    remove_urls_from_index(index_path::AbstractString, prefix_urls::Vector{<:AbstractString})
+
+Remove chunks and sources corresponding to URLs starting with `prefix_urls`
+"""
+function remove_urls_from_index(index_path::AbstractString, prefix_urls::Vector{<:AbstractString})
+    @assert endswith(index_path, ".hdf5") "Provided file path must end with `.hdf5` (see HDF5.jl)."
+ h5open(index_path, "r+") do orig_file
+ # Load the sources dataset into a Julia array
+ sources = read(orig_file["sources"])
+ chunks = read(orig_file["chunks"])
+ embeddings = read(orig_file["embeddings"])
+ for url_to_remove in prefix_urls
+ indices_to_remove = findall(x -> startswith(x, url_to_remove), sources)
+ sources = deleteat!(sources, indices_to_remove)
+ chunks = deleteat!(chunks, indices_to_remove)
+ embeddings = embeddings[:, setdiff(1:size(embeddings, 2), indices_to_remove)]
+ end
+
+ write(file["sources"], sources)
+ write(file["chunks"], chunks)
+ write(file["embeddings"], embeddings)
+ end
end
\ No newline at end of file
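
A usage sketch with an illustrative pack path and URL prefix:

```julia
index_path = joinpath("knowledge_packs", "packs",
    "docs.julialang.org-textembedding3large-0-Float32__v1.0.hdf5")   # illustrative pack
remove_urls_from_index(index_path, ["https://docs.julialang.org/en/v1/devdocs/"])
```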
diff --git a/test/runtests.jl b/test/runtests.jl
index 78a78b4..4b4a92c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,21 +10,22 @@ using LinearAlgebra, Unicode, SparseArrays
using HDF5
using Tar
using Inflate
-
using SHA
using Serialization, URIs
-include("..\\src\\crawl.jl")
-include("..\\src\\extract_urls.jl")
-include("..\\src\\parser.jl")
-include("..\\src\\preparation.jl")
+include(joinpath("..", "src", "crawl.jl"))
+include(joinpath("..", "src", "extract_urls.jl"))
+include(joinpath("..", "src", "parser.jl"))
+include(joinpath("..", "src", "preparation.jl"))
+include(joinpath("..", "src", "user_preferences.jl"))
+include(joinpath("..", "src", "utils.jl"))
+
urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
url = urls[1]
queue = Vector{AbstractString}()
-@testset "check robots.txt" begin
+@testset "HTTP" begin
@test HTTP.get(url) != nothing
-
result, sitemap_queue = check_robots_txt("*", url)
@test result == true
end
@@ -38,12 +39,13 @@ end
parsed_blocks = parse_url_to_blocks(url)
@test length(parsed_blocks) > 0
    SEP = "<SEP>"
- docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP)
- @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && sources_[1] != nothing
+ docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
+ @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
+ sources_[1] != nothing
end
@testset "overall test" begin
chunks, sources = process_paths(url)
- @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && sources[1] != nothing
-
+ @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing &&
+ sources[1] != nothing
end
From 0782e01ba709f18bda1cca77cb83e8b73922630e Mon Sep 17 00:00:00 2001
From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Sun, 11 Aug 2024 03:19:20 -0700
Subject: [PATCH 3/7] dependency changes
---
.github/workflows/CI.yml | 1 -
Project.toml | 4 ++++
src/DocsScraper.jl | 3 ---
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 874943f..0b6af25 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -24,7 +24,6 @@ jobs:
matrix:
version:
- "1.10"
- - "nightly"
os:
- ubuntu-latest
arch:
diff --git a/Project.toml b/Project.toml
index 705a918..bc05f3f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,8 @@ PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[compat]
AbstractTrees = "0.4.5"
@@ -26,6 +28,8 @@ PromptingTools = "0.36.0"
URIParser = "0.4.1"
URIs = "1.5.1"
Tar = "1.10.0"
+LinearAlgebra = "<0.0.1, 1"
+SparseArrays = "<0.0.1, 1"
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/src/DocsScraper.jl b/src/DocsScraper.jl
index e78dde7..40bc3ee 100644
--- a/src/DocsScraper.jl
+++ b/src/DocsScraper.jl
@@ -2,8 +2,6 @@ module DocsScraper
using HTTP, Gumbo, AbstractTrees, URIs
using Gumbo: HTMLDocument, HTMLElement
using EzXML
-using Pkg
-Pkg.develop(PackageSpec(path="C:\\Users\\shrey\\Desktop\\stuff\\assignments\\grad\\projects\\Julia\\PromptingTools.jl"))
using PromptingTools
const PT = PromptingTools
const RT = PromptingTools.Experimental.RAGTools
@@ -27,5 +25,4 @@ include("user_preferences.jl")
include("utils.jl")
export remove_urls_from_index
-
end
\ No newline at end of file
From 52998a90e7f8373d879537e8242309a93c227a39 Mon Sep 17 00:00:00 2001
From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Sun, 11 Aug 2024 03:30:37 -0700
Subject: [PATCH 4/7] dependency changes
---
.github/workflows/CI.yml | 62 ++++++++++++++++++++--------------------
Project.toml | 15 ++++++----
2 files changed, 41 insertions(+), 36 deletions(-)
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 0b6af25..1c00a7a 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -43,34 +43,34 @@ jobs:
files: lcov.info
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: false
- docs:
- name: Documentation
- runs-on: ubuntu-latest
- permissions:
- actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created
- contents: write
- statuses: write
- steps:
- - uses: actions/checkout@v4
- - uses: julia-actions/setup-julia@v2
- with:
- version: "1"
- - uses: julia-actions/cache@v2
- - name: Configure doc environment
- shell: julia --project=docs --color=yes {0}
- run: |
- using Pkg
- Pkg.develop(PackageSpec(path=pwd()))
- Pkg.instantiate()
- - uses: julia-actions/julia-buildpkg@v1
- - uses: julia-actions/julia-docdeploy@v1
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
- - name: Run doctests
- shell: julia --project=docs --color=yes {0}
- run: |
- using Documenter: DocMeta, doctest
- using DocsScraper
- DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)
- doctest(DocsScraper)
+ # docs:
+ # name: Documentation
+ # runs-on: ubuntu-latest
+ # permissions:
+ # actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created
+ # contents: write
+ # statuses: write
+ # steps:
+ # - uses: actions/checkout@v4
+ # - uses: julia-actions/setup-julia@v2
+ # with:
+ # version: "1"
+ # - uses: julia-actions/cache@v2
+ # - name: Configure doc environment
+ # shell: julia --project=docs --color=yes {0}
+ # run: |
+ # using Pkg
+ # Pkg.develop(PackageSpec(path=pwd()))
+ # Pkg.instantiate()
+ # - uses: julia-actions/julia-buildpkg@v1
+ # - uses: julia-actions/julia-docdeploy@v1
+ # env:
+ # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ # DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+ # - name: Run doctests
+ # shell: julia --project=docs --color=yes {0}
+ # run: |
+ # using Documenter: DocMeta, doctest
+ # using DocsScraper
+ # DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)
+ # doctest(DocsScraper)
diff --git a/Project.toml b/Project.toml
index bc05f3f..16502d1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,12 +10,15 @@ Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
+SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[compat]
AbstractTrees = "0.4.5"
@@ -24,12 +27,14 @@ Gumbo = "0.8.2"
HDF5 = "0.17.2"
HTTP = "1.10.4"
Inflate = "0.1.5"
+LinearAlgebra = "<0.0.1, 1"
PromptingTools = "0.36.0"
+SparseArrays = "<0.0.1, 1"
+Tar = "1.10.0"
URIParser = "0.4.1"
URIs = "1.5.1"
-Tar = "1.10.0"
-LinearAlgebra = "<0.0.1, 1"
-SparseArrays = "<0.0.1, 1"
+SHA = "0.7.0"
+
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
From 6f32002254aaa8ff829225fb9d1a92cd1980398c Mon Sep 17 00:00:00 2001
From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Wed, 14 Aug 2024 15:00:36 -0700
Subject: [PATCH 5/7] code improvements
---
.JuliaFormatter.toml | 1 +
.github/workflows/CI.yml | 70 +++++++--------
.gitignore | 4 +-
.vscode/settings.json | 6 --
LICENSE | 2 +-
Project.toml | 38 ++++----
docs/Project.toml | 10 +--
docs/make.jl | 29 ++++---
docs/src/index.md | 8 +-
src/DocsScraper.jl | 11 ++-
src/crawl.jl | 18 ++--
src/extract_package_name.jl | 162 +++++++++++++++++++++++++++++++++++
src/extract_urls.jl | 2 +-
src/make_knowledge_packs.jl | 69 ++++++++++-----
src/parser.jl | 5 +-
src/preparation.jl | 33 ++++---
src/user_preferences.jl | 4 +-
src/utils.jl | 72 ++++++++++++----
test/crawl.jl | 7 ++
test/make_knowledge_packs.jl | 8 ++
test/parser.jl | 11 +++
test/runtests.jl | 56 ++----------
test/utils.jl | 10 +++
23 files changed, 427 insertions(+), 209 deletions(-)
delete mode 100644 .vscode/settings.json
create mode 100644 src/extract_package_name.jl
create mode 100644 test/crawl.jl
create mode 100644 test/make_knowledge_packs.jl
create mode 100644 test/parser.jl
create mode 100644 test/utils.jl
diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
index 5657bd0..9601a61 100644
--- a/.JuliaFormatter.toml
+++ b/.JuliaFormatter.toml
@@ -1,2 +1,3 @@
# See https://domluna.github.io/JuliaFormatter.jl/stable/ for a list of options
style = "sciml"
+ignore = ["knowledge_packs"]
\ No newline at end of file
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 1c00a7a..5cd2adb 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -38,39 +38,39 @@ jobs:
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
- - uses: codecov/codecov-action@v4
+ # - uses: codecov/codecov-action@v4
+ # with:
+ # files: lcov.info
+ # token: ${{ secrets.CODECOV_TOKEN }}
+ # fail_ci_if_error: false
+ docs:
+ name: Documentation
+ runs-on: ubuntu-latest
+ permissions:
+ actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created
+ contents: write
+ statuses: write
+ steps:
+ - uses: actions/checkout@v4
+ - uses: julia-actions/setup-julia@v2
with:
- files: lcov.info
- token: ${{ secrets.CODECOV_TOKEN }}
- fail_ci_if_error: false
- # docs:
- # name: Documentation
- # runs-on: ubuntu-latest
- # permissions:
- # actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created
- # contents: write
- # statuses: write
- # steps:
- # - uses: actions/checkout@v4
- # - uses: julia-actions/setup-julia@v2
- # with:
- # version: "1"
- # - uses: julia-actions/cache@v2
- # - name: Configure doc environment
- # shell: julia --project=docs --color=yes {0}
- # run: |
- # using Pkg
- # Pkg.develop(PackageSpec(path=pwd()))
- # Pkg.instantiate()
- # - uses: julia-actions/julia-buildpkg@v1
- # - uses: julia-actions/julia-docdeploy@v1
- # env:
- # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- # DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
- # - name: Run doctests
- # shell: julia --project=docs --color=yes {0}
- # run: |
- # using Documenter: DocMeta, doctest
- # using DocsScraper
- # DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)
- # doctest(DocsScraper)
+ version: "1"
+ - uses: julia-actions/cache@v2
+ - name: Configure doc environment
+ shell: julia --project=docs --color=yes {0}
+ run: |
+ using Pkg
+ Pkg.develop(PackageSpec(path=pwd()))
+ Pkg.instantiate()
+ - uses: julia-actions/julia-buildpkg@v1
+ - uses: julia-actions/julia-docdeploy@v1
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+ - name: Run doctests
+ shell: julia --project=docs --color=yes {0}
+ run: |
+ using Documenter: DocMeta, doctest
+ using DocsScraper
+ DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)
+ doctest(DocsScraper)
diff --git a/.gitignore b/.gitignore
index 8e2d4ba..4a1c7f4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,6 @@ knowledge_packs/
Manifest.toml
/Manifest.toml
/docs/Manifest.toml
-/docs/build/
\ No newline at end of file
+/docs/build/
+.vscode/**
+**/.DS_Store
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index 9238ca7..0000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "cSpell.words": [
- "eachmatch",
- "postprocess"
- ]
-}
diff --git a/LICENSE b/LICENSE
index d7bd022..183f1b7 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) Shreyas Agrawal @splendidbug and J S @svilupp
+Copyright (c) Shreyas Agrawal @splendidbug and contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/Project.toml b/Project.toml
index 16502d1..1fb77c2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,40 +1,46 @@
name = "DocsScraper"
uuid = "bd71d052-5e08-40cc-a492-eb4e8da4b649"
-authors = ["Shreyas Agrawal @splendidbug and J S @svilupp"]
+authors = ["Shreyas Agrawal @splendidbug and contributors"]
version = "0.1.0"
[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
-URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[compat]
-AbstractTrees = "0.4.5"
-EzXML = "1.2.0"
-Gumbo = "0.8.2"
-HDF5 = "0.17.2"
-HTTP = "1.10.4"
-Inflate = "0.1.5"
-LinearAlgebra = "<0.0.1, 1"
-PromptingTools = "0.36.0"
-SparseArrays = "<0.0.1, 1"
-Tar = "1.10.0"
-URIParser = "0.4.1"
-URIs = "1.5.1"
-SHA = "0.7.0"
-
+AbstractTrees = "0.4"
+Aqua = "0.8"
+Dates = "1"
+EzXML = "1.2"
+Gumbo = "0.8"
+HDF5 = "0.17"
+HTTP = "1.10"
+Inflate = "0.1"
+LinearAlgebra = "1"
+PromptingTools = "0.48"
+SHA = "0.7"
+Serialization = "1"
+SparseArrays = "1"
+Tar = "1"
+Test = "1"
+URIs = "1.5"
+Unicode = "1"
+julia = "1.10"
+JSON = "0.21"
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/docs/Project.toml b/docs/Project.toml
index 41b0b18..15c39b1 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,19 +1,15 @@
[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
+DocsScraper = "bd71d052-5e08-40cc-a492-eb4e8da4b649"
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
+LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
-
-[compat]
-AbstractTrees = "0.4.5"
-Gumbo = "0.8.2"
-HTTP = "1.10.4"
-PromptingTools = "0.36.0"
-URIs = "1.5.1"
diff --git a/docs/make.jl b/docs/make.jl
index a54f0f6..47bd6f5 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,23 +1,24 @@
using DocsScraper
using Documenter
-DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)
+DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive = true)
makedocs(;
- modules=[DocsScraper],
- authors="Shreyas Agrawal @splendidbug and J S @svilupp",
- sitename="DocsScraper.jl",
- # format=Documenter.HTML(;
- # canonical="https://Shreyas Agrawal.github.io/DocsScraper.jl",
- # edit_link="master",
- # assets=String[],
- # ),
- pages=[
- "Home" => "index.md",
- ],
+ modules = [DocsScraper],
+ authors = "Shreyas Agrawal @splendidbug and contributors",
+ sitename = "DocsScraper.jl",
+ repo = "https://github.com/splendidbug/DocsScraper.jl/blob/{commit}{path}#{line}",
+ format = Documenter.HTML(;
+ repolink = "https://github.com/splendidbug/DocsScraper.jl",
+ canonical = "https://splendidbug.github.io/DocsScraper.jl",
+ edit_link = "main",
+ assets = String[]),
+ pages = [
+ "API Index" => "index.md"
+ ]
)
deploydocs(;
- repo="github.com/Shreyas Agrawal/DocsScraper.jl",
- devbranch="main",
+ repo = "github.com/splendidbug/DocsScraper.jl",
+ devbranch = "main"
)
diff --git a/docs/src/index.md b/docs/src/index.md
index a6f0129..c30e1af 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,4 +1,8 @@
-# DocsScraper
+# Reference
-## Documentation
+```@index
+```
+```@autodocs
+Modules = [DocsScraper]
+```
\ No newline at end of file
diff --git a/src/DocsScraper.jl b/src/DocsScraper.jl
index 40bc3ee..7f114d9 100644
--- a/src/DocsScraper.jl
+++ b/src/DocsScraper.jl
@@ -9,20 +9,23 @@ using LinearAlgebra, Unicode, SparseArrays
using HDF5
using Tar
using Inflate
-
using SHA
using Serialization, URIs
+using Dates
+using JSON
include("parser.jl")
include("crawl.jl")
include("extract_urls.jl")
include("preparation.jl")
+include("extract_package_name.jl")
+export get_package_name
include("make_knowledge_packs.jl")
-export make_knowledge_packs, just_generate
+export make_knowledge_packs
include("user_preferences.jl")
include("utils.jl")
-export remove_urls_from_index
+export remove_urls_from_index, urls_for_metadata
-end
\ No newline at end of file
+end
diff --git a/src/crawl.jl b/src/crawl.jl
index a8f93c9..c972ef2 100644
--- a/src/crawl.jl
+++ b/src/crawl.jl
@@ -5,7 +5,7 @@
Parse the robots.txt string and return rules and the URLs on Sitemap
"""
function parse_robots_txt!(robots_txt::String)
- rules = Dict{String,Dict{String,Vector{String}}}()
+ rules = Dict{String, Dict{String, Vector{String}}}()
current_user_agent = ""
sitemap_urls = Vector{AbstractString}()
@@ -14,7 +14,8 @@ function parse_robots_txt!(robots_txt::String)
if startswith(line, "User-agent:")
current_user_agent = strip(split(line, ":")[2])
if !haskey(rules, current_user_agent)
- rules[current_user_agent] = Dict("Disallow" => Vector{String}(), "Allow" => Vector{String}())
+ rules[current_user_agent] = Dict(
+ "Disallow" => Vector{String}(), "Allow" => Vector{String}())
end
elseif startswith(line, "Disallow:")
disallow_path = strip(split(line, ":")[2])
@@ -30,12 +31,10 @@ function parse_robots_txt!(robots_txt::String)
url = strip(split(line, ":")[2])
push!(sitemap_urls, url)
end
-
end
return rules, sitemap_urls
end
-
"""
check_robots_txt(user_agent::AbstractString, url::AbstractString)
@@ -99,14 +98,12 @@ end
Extract the base url
"""
function get_base_url(url::AbstractString)
-
parsed_url = URIs.URI(url)
base_url = string(parsed_url.scheme, "://", parsed_url.host,
parsed_url.port != nothing ? "" * string(parsed_url.port) : "", parsed_url.path)
return base_url
end
-
"""
process_hostname(url::AbstractString)
@@ -118,7 +115,6 @@ function process_hostname(url::AbstractString)
return hostname
end
-
"""
process_hostname(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}})
@@ -128,7 +124,8 @@ Add `url` to its hostname in `hostname_dict`
- `url`: URL string
- `hostname_dict`: Dict with key being hostname and value being a vector of URLs
"""
-function process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}})
+function process_hostname!(
+ url::AbstractString, hostname_dict::Dict{AbstractString, Vector{AbstractString}})
hostname = process_hostname(url)
# Add the URL to the dictionary under its hostname
@@ -139,17 +136,15 @@ function process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractStri
end
end
-
"""
crawl(input_urls::Vector{<:AbstractString})
Crawl on the input URLs and return a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs
"""
function crawl(input_urls::Vector{<:AbstractString})
-
url_queue = Vector{AbstractString}(input_urls)
visited_url_set = Set{AbstractString}()
- hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}()
+ hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}()
sitemap_urls = Vector{AbstractString}()
# TODO: Add parallel processing for URLs
@@ -174,5 +169,4 @@ function crawl(input_urls::Vector{<:AbstractString})
end
return hostname_url_dict, visited_url_set
-
end
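For orientation, a minimal usage sketch of the reformatted `crawl` entry point (the URL is illustrative; the call hits the live site and honours robots.txt):

```julia
using DocsScraper: crawl

input_urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
hostname_url_dict, visited = crawl(input_urls)
# hostname_url_dict maps hostnames, e.g. "docs.julialang.org", to the page URLs discovered under them
```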
diff --git a/src/extract_package_name.jl b/src/extract_package_name.jl
new file mode 100644
index 0000000..525cecf
--- /dev/null
+++ b/src/extract_package_name.jl
@@ -0,0 +1,162 @@
+"""
+ clean_url(url::String)
+
+Strip the URL of any http://, https://, or www. prefixes
+"""
+function clean_url(url::String)
+ # Remove http://, https://, www., or wwws.
+ cleaned_url = replace(url, r"^https?://(www\d?\.)?" => "")
+ return cleaned_url
+end
+
+"""
+ base_url_segment(url::String)
+
+Return the base URL and the first path segment; used as the fallback when all other checks fail
+"""
+function base_url_segment(url::String)
+ # Clean the URL from unwanted prefixes
+ cleaned_url = clean_url(url)
+
+ # Parse the cleaned URL
+ uri = URI("https://" * cleaned_url) # Add https:// to ensure correct parsing
+
+ # Extract the base URL (host)
+ base_url = replace(uri.host, r"^www\." => "")
+
+ # Extract the first path segment
+ path_segments = split(uri.path, "/"; keepempty = false)
+
+ if !isempty(path_segments)
+ first_segment = path_segments[1]
+ return "$base_url/$first_segment"
+ else
+ return base_url
+ end
+end
+
+"""
+ url_package_name(url::AbstractString)
+
+Return the package name if the URL itself contains it with a ".jl" or "_jl" suffix
+"""
+function url_package_name(url::AbstractString)
+ if occursin(r"\.jl", url) || occursin(r"_jl", url)
+ package_name = match(r"[\/]([^\/]+(?:\.jl|_jl))", url)
+ return package_name.captures[1]
+ end
+ return ""
+end
+
+"""
+ get_base_url(url::AbstractString)
+
+Extract the base url
+"""
+function get_base_url(url::AbstractString)
+ parsed_url = URIs.URI(url)
+ base_url = string(parsed_url.scheme, "://", parsed_url.host,
+ parsed_url.port != nothing ? ":" * string(parsed_url.port) : "", parsed_url.path)
+ return base_url
+end
+
+"""
+ nav_bar(url::AbstractString)
+
+Julia documentation websites tend to expose the package name under the ".docs-package-name" class in the HTML tree
+"""
+function nav_bar(url::AbstractString)
+ base_url = get_base_url(url)
+ fetched_content = HTTP.get(base_url)
+ parsed = Gumbo.parsehtml(String(fetched_content.body))
+ content_candidates = [el
+ for el in AbstractTrees.PreOrderDFS(parsed.root)
+ if el isa HTMLElement]
+ content_by_class = filter(
+ el -> getattr(el, "class", nothing) in ["docs-package-name"], content_candidates)
+ if (!isempty(content_by_class))
+ parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)])
+ heading_hierarchy = Dict{Symbol, Any}()
+ process_node!(only(content_by_class), heading_hierarchy, parsed_blocks)
+ package_name = parsed_blocks[2]["text"]
+ return package_name
+ end
+ return ""
+end
+
+"""
+ text_before_version(url::AbstractString)
+
+Return the text before "stable", "dev", or a version number in the URL. Documentation websites generally place the package name right before the version
+"""
+function text_before_version(url::AbstractString)
+ language_prefixes = [
+ "/en/", "/es/", "/fr/", "/de/", "/it/", "/pt/", "/ru/", "/zh/", "/ja/", "/ko/"]
+ contains_prefix = any(occursin(prefix, url) for prefix in language_prefixes)
+ if contains_prefix
+ pattern = r"/([^/]+)/([^/]+)/(?:stable|dev|latest|v\d+(\.\d+)*)(?:/|$)"
+ else
+ pattern = r"/([^/]+)/(?:stable|dev|latest|v\d+(\.\d+)*)"
+ end
+ package_name = match(pattern, url)
+ if package_name !== nothing
+ return package_name.captures[1]
+ end
+ return ""
+end
+
+"""
+ docs_in_url(url::AbstractString)
+
+If the base URL has the form docs.package_name.domain_extension, return the middle segment, i.e. the package name
+"""
+function docs_in_url(url::AbstractString)
+ cleaned_url = clean_url(url)
+
+ # Parse the cleaned URL
+ uri = URI("https://" * cleaned_url) # Add https:// to ensure correct parsing
+
+ # Extract the base URL (host)
+ base_url = replace(uri.host, r"^www\." => "")
+ pattern = r"docs\.([^.]+)\.(org|com|ai|net|io|co|tech)"
+ m = match(pattern, base_url)
+ if m !== nothing
+ return m.captures[1]
+ end
+ return ""
+end
+
+"""
+ get_package_name(url::AbstractString)
+
+Return the package name derived from the package URL
+"""
+function get_package_name(url::AbstractString)
+
+ # try 1: look for package name in URL
+ package_name = url_package_name(url)
+ if (!isempty(package_name))
+ return package_name
+ end
+
+ # try 2: look for package name in nav bar
+ package_name = nav_bar(url)
+ if (!isempty(package_name))
+ return package_name
+ end
+
+ # try 3: if the base url is in the form docs.package_name.domain_extension
+ package_name = docs_in_url(url)
+ if (!isempty(package_name))
+ return package_name
+ end
+
+ # try 4: get text before "stable" or "dev" or any version in URL
+ package_name = text_before_version(url)
+ if (!isempty(package_name))
+ return package_name
+ end
+
+ # fallback: return base URL with first path segment
+ return base_url_segment(url)
+end
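A hedged sketch of the new package-name heuristics (the result of the second call depends on the live page, since the nav-bar and later fallbacks fetch the site's HTML):

```julia
using DocsScraper

# try 1: the URL itself carries a ".jl" name
get_package_name("https://github.com/JuliaData/DataFrames.jl")   # "DataFrames.jl"

# later fallbacks: the nav bar, docs.<name>.<tld> hosts, or the text before the version segment
get_package_name("https://docs.makie.org/stable/")               # likely "makie"
```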
diff --git a/src/extract_urls.jl b/src/extract_urls.jl
index d5e8fcf..d750f34 100644
--- a/src/extract_urls.jl
+++ b/src/extract_urls.jl
@@ -141,4 +141,4 @@ function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString})
else
find_urls_html!(url, parsed.root, url_queue)
end
-end
\ No newline at end of file
+end
diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl
index 291a9c7..5d56ff8 100644
--- a/src/make_knowledge_packs.jl
+++ b/src/make_knowledge_packs.jl
@@ -24,8 +24,8 @@ function create_output_folders(knowledge_pack_path::String)
end
"""
- make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; max_chunk_size::Int=MAX_CHUNK_SIZE,
- min_chunk_size::Int=MIN_CHUNK_SIZE)
+ make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String;
+ max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
Parse URLs from hostname_url_dict and save the chunks
@@ -44,7 +44,8 @@ function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractStri
output_sources = Vector{String}()
for url in urls
try
- chunks, sources = process_paths(url; max_chunk_size, min_chunk_size)
+ chunks, sources = process_paths(
+ url; max_chunk_size, min_chunk_size)
append!(output_chunks, chunks)
append!(output_sources, sources)
catch
@@ -85,16 +86,20 @@ function l2_norm_columns(vect::AbstractVector)
end
"""
- generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE)
+ generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL,
+ embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString)
Deserialize chunks and sources to generate embeddings
# Arguments
- model: Embedding model
- embedding_size: Embedding dimensions
+- custom_metadata: Custom metadata like ecosystem name if required
"""
-function generate_embeddings(knowledge_pack_path::String; model::AbstractString = MODEL,
- embedding_size::Int = EMBEDDING_SIZE)
+function generate_embeddings(
+ knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
+ model::AbstractString = MODEL,
+ embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString)
embedder = RT.BatchEmbedder()
entries = readdir(knowledge_pack_path)
# Initialize a dictionary to group files by hostname and chunk size
@@ -114,31 +119,31 @@ function generate_embeddings(knowledge_pack_path::String; model::AbstractString
if match_chunks !== nothing
hostname = match_chunks.captures[1]
- chunk_size = parse(Int, match_chunks.captures[2])
+ max_chunk_size = parse(Int, match_chunks.captures[2])
if !haskey(hostname_files, hostname)
hostname_files[hostname] = Dict{Int, Dict{String, String}}()
end
- if !haskey(hostname_files[hostname], chunk_size)
- hostname_files[hostname][chunk_size] = Dict{String, String}()
+ if !haskey(hostname_files[hostname], max_chunk_size)
+ hostname_files[hostname][max_chunk_size] = Dict{String, String}()
end
- hostname_files[hostname][chunk_size]["chunks"] = joinpath(
+ hostname_files[hostname][max_chunk_size]["chunks"] = joinpath(
knowledge_pack_path, file)
elseif match_sources !== nothing
hostname = match_sources.captures[1]
- chunk_size = parse(Int, match_sources.captures[2])
+ max_chunk_size = parse(Int, match_sources.captures[2])
if !haskey(hostname_files, hostname)
hostname_files[hostname] = Dict{Int, Dict{String, String}}()
end
- if !haskey(hostname_files[hostname], chunk_size)
- hostname_files[hostname][chunk_size] = Dict{String, String}()
+ if !haskey(hostname_files[hostname], max_chunk_size)
+ hostname_files[hostname][max_chunk_size] = Dict{String, String}()
end
- hostname_files[hostname][chunk_size]["sources"] = joinpath(
+ hostname_files[hostname][max_chunk_size]["sources"] = joinpath(
knowledge_pack_path, file)
end
end
# Process each pair of files
for (hostname, chunk_files) in hostname_files
- for (chunk_size, files) in chunk_files
+ for (max_chunk_size, files) in chunk_files
if haskey(files, "chunks") && haskey(files, "sources")
chunks_file = files["chunks"]
sources_file = files["sources"]
@@ -148,17 +153,31 @@ function generate_embeddings(knowledge_pack_path::String; model::AbstractString
full_embeddings = RT.get_embeddings(
embedder, chunks; model, verbose = false, cost_tracker)
@info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))"
+
+ trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
fn_output = joinpath(knowledge_pack_path, "packs",
- "$hostname-textembedding3large-0-Float32__v1.0.tar.gz")
+ "$hostname-$model-$trunc-Float32__v1.0.tar.gz")
fn_temp = joinpath(knowledge_pack_path, "packs",
- "$hostname-textembedding3large-0-Float32__v1.0.hdf5")
+ "$hostname-$model-$trunc-Float32__v1.0.hdf5")
+
h5open(fn_temp, "w") do file
file["chunks"] = chunks
file["sources"] = sources
file["embeddings"] = full_embeddings[1:embedding_size, :] |>
l2_norm_columns |> x -> map(>(0), x)
file["type"] = "ChunkIndex"
- # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl"
+
+ package_url_dict = Dict{String, Vector{String}}()
+ package_url_dict = urls_for_metadata(sources)
+
+ metadata = Dict(
+ :embedded_dt => Dates.today(),
+ :custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size,
+ :embedding_size => embedding_size, :model => model,
+ :packages => package_url_dict)
+
+ metadata_json = JSON.json(metadata)
+ file["metadata"] = metadata_json
end
command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
@@ -166,7 +185,7 @@ function generate_embeddings(knowledge_pack_path::String; model::AbstractString
report_artifact(fn_output)
else
- @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size"
+ @warn "Missing pair for hostname: $hostname, max chunk size: $max_chunk_size"
end
end
end
@@ -174,7 +193,8 @@ end
"""
make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],
- max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE)
+ max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE,
+ custom_metadata::AbstractString)
Entry point to crawl, parse and generate embeddings
@@ -185,11 +205,12 @@ Entry point to crawl, parse and generate embeddings
- min_chunk_size: Minimum chunk size
- model: Embedding model
- embedding_size: Embedding dimensions
+- custom_metadata: Custom metadata like ecosystem name if required
"""
function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[];
single_urls::Vector{<:AbstractString} = String[],
max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE,
- model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE)
+ model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "")
if isempty(crawlable_urls) && isempty(single_urls)
error("At least one of `input_urls` or `single_pages` must be provided.")
end
@@ -217,6 +238,8 @@ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[
end
knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs")
create_output_folders(knowledge_pack_path)
- make_chunks(hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size)
- generate_embeddings(knowledge_pack_path; model, embedding_size)
+ make_chunks(
+ hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size)
+ generate_embeddings(
+ knowledge_pack_path; max_chunk_size, model, embedding_size, custom_metadata)
end
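With the new keyword wired through, a build might look like the sketch below (URL and metadata string are illustrative; the embedding step calls the configured model through PromptingTools.jl, so the corresponding API key must be set):

```julia
using DocsScraper

make_knowledge_packs(["https://docs.sciml.ai/Overview/stable/"];
    custom_metadata = "SciML ecosystem",
    max_chunk_size = 384,
    embedding_size = 3072)
```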
diff --git a/src/parser.jl b/src/parser.jl
index def1a17..2de7035 100644
--- a/src/parser.jl
+++ b/src/parser.jl
@@ -340,9 +340,6 @@ end
parse_url(url::AbstractString)
Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata
-
-# Arguments
-- `url`: URL string to parse
"""
function parse_url_to_blocks(url::AbstractString)
@@ -356,7 +353,7 @@ function parse_url_to_blocks(url::AbstractString)
# Getting title of the document
# title = [el
# for el in AbstractTrees.PreOrderDFS(r_parsed.root)
- # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")
+ # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")
parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)])
heading_hierarchy = Dict{Symbol, Any}()
process_node!(get_html_content(parsed.root), heading_hierarchy, parsed_blocks)
diff --git a/src/preparation.jl b/src/preparation.jl
index 9979155..8736050 100644
--- a/src/preparation.jl
+++ b/src/preparation.jl
@@ -10,22 +10,21 @@ get_header_path(d)
# Output: "Axis/Attributes/yzoomkey"
```
"""
-function get_header_path(d::Dict{String,Any})
- metadata = get(d, "metadata", Dict{Any,Any}())
+function get_header_path(d::Dict{String, Any})
+ metadata = get(d, "metadata", Dict{Any, Any}())
isempty(metadata) && return nothing
keys_ = [:h1, :h2, :h3]
vals = get.(Ref(metadata), keys_, "") |> x -> filter(!isempty, x) |> x -> join(x, "/")
isempty(vals) ? nothing : vals
end
-
-
"""
roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="")
Roll-up chunks (that have the same header!), so we can split them later by to get the desired length
"""
-function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="")
+function roll_up_chunks(parsed_blocks::Vector{Dict{String, Any}},
+ url::AbstractString; separator::String = "")
docs = String[]
io = IOBuffer()
last_header = nothing
@@ -57,7 +56,6 @@ function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractSt
return docs, sources
end
-
struct DocParserChunker <: RT.AbstractChunker end
"""
@@ -74,9 +72,9 @@ and splits them by separators to get the desired length.
- separators: Chunk separators
- max_chunk_size Maximum chunk size
"""
-function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
- verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE)
-
+function RT.get_chunks(
+ chunker::DocParserChunker, url::AbstractString;
+ verbose::Bool = true, separators = ["\n\n", ". ", "\n", " "], max_chunk_size::Int = MAX_CHUNK_SIZE)
SEP = ""
sources = AbstractVector{<:AbstractString}
output_chunks = Vector{SubString{String}}()
@@ -86,14 +84,14 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
parsed_blocks = parse_url_to_blocks(url)
## Roll up to the same header
- docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP)
+ docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
## roll up chunks by SEP splitter, then remove it later
for (doc, src) in zip(docs_, sources_)
## roll up chunks by SEP splitter, then remove it later
- doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length=max_chunk_size) .|>
+ doc_chunks = PT.recursive_splitter(
+ doc, [SEP, separators...]; max_length = max_chunk_size) .|>
x -> replace(x, SEP => " ") .|> strip |> x -> filter(!isempty, x)
- chunk_lengths = length.(doc_chunks)
# skip if no chunks found
isempty(doc_chunks) && continue
append!(output_chunks, doc_chunks)
@@ -102,15 +100,14 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
return output_chunks, output_sources
end
-
-
"""
process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them.
"""
-function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
-
+function process_paths(url::AbstractString;
+ max_chunk_size::Int = MAX_CHUNK_SIZE,
+ min_chunk_size::Int = MIN_CHUNK_SIZE)
output_chunks = Vector{SubString{String}}()
output_sources = Vector{String}()
@@ -119,9 +116,9 @@ function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE,
append!(output_chunks, chunks)
append!(output_sources, sources)
-
@info "Scraping done: $(length(output_chunks)) chunks"
- output_chunks, output_sources = postprocess_chunks(output_chunks, output_sources; min_chunk_size, skip_code=true)
+ output_chunks, output_sources = postprocess_chunks(
+ output_chunks, output_sources; min_chunk_size, skip_code = true)
return output_chunks, output_sources
end
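The chunker can also be driven directly through the RAGTools interface; a sketch, assuming the non-exported `DocParserChunker` is accessed by name:

```julia
using DocsScraper: DocParserChunker
using PromptingTools.Experimental.RAGTools: get_chunks

chunks, sources = get_chunks(DocParserChunker(), "https://docs.julialang.org/en/v1/";
    max_chunk_size = 384)
```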
diff --git a/src/user_preferences.jl b/src/user_preferences.jl
index 98794c6..00c1a2f 100644
--- a/src/user_preferences.jl
+++ b/src/user_preferences.jl
@@ -1,4 +1,4 @@
global MIN_CHUNK_SIZE = 40
-global MAX_CHUNK_SIZE = 256
+global MAX_CHUNK_SIZE = 384
global MODEL = "text-embedding-3-large"
-global EMBEDDING_SIZE = 1024
\ No newline at end of file
+global EMBEDDING_SIZE = 3072
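These globals only set defaults; individual runs can still override them, e.g. to reproduce the previous, smaller settings:

```julia
make_knowledge_packs(["https://docs.julialang.org/en/v1/"];
    max_chunk_size = 256, embedding_size = 1024)
```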
diff --git a/src/utils.jl b/src/utils.jl
index e8dc014..dfbc17c 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -30,21 +30,21 @@ end
Remove chunks that are duplicated in the input list of chunks and their corresponding sources.
"""
-function remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
+function remove_duplicates(
+ chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
idxs = find_duplicates(chunks)
return chunks[.!idxs], sources[.!idxs]
end
-
"""
remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
Remove chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources.
"""
-function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
- min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
-
+function remove_short_chunks(
+ chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int = MIN_CHUNK_SIZE, skip_code::Bool = true)
chunk_lengths = length.(chunks)
idx = if skip_code
## Keep short chunks if they contain code (might be combined with some preceding/succeeding text)
@@ -56,17 +56,15 @@ function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::
return chunks[idx], sources[idx]
end
-
-function replace_local_paths(sources::AbstractVector{<:AbstractString}, paths::AbstractVector{<:AbstractString}, websites::AbstractVector{<:AbstractString})
- @assert length(paths) == length(websites) "Length of `paths` must match length of `websites`"
+function replace_local_paths(
+ sources::AbstractVector{<:AbstractString}, paths::AbstractVector{<:AbstractString},
+ websites::AbstractVector{<:AbstractString})
+ @assert length(paths)==length(websites) "Length of `paths` must match length of `websites`"
replacement_pairs = paths .=> websites
output = map(x -> replace(x, replacement_pairs...), sources)
return output
end
-
-
-
"""
function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
@@ -74,9 +72,11 @@ end
Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates.
"""
-function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
- min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
- websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
+function postprocess_chunks(
+ chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int = MIN_CHUNK_SIZE, skip_code::Bool = true,
+ paths::Union{Nothing, AbstractVector{<:AbstractString}} = nothing,
+ websites::Union{Nothing, AbstractVector{<:AbstractString}} = nothing)
len_ = length(chunks)
chunks, sources = remove_short_chunks(chunks, sources; min_chunk_size, skip_code)
@info "Removed $(len_ - length(chunks)) short chunks"
@@ -99,7 +99,8 @@ end
Remove chunks and sources corresponding to URLs starting with `prefix_urls`
"""
-function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString})
+function remove_urls_from_index(
+ index_path::AbstractString, prefix_urls = Vector{<:AbstractString})
@assert endswith(file_path, ".hdf5") "Provided file path must end with `.hdf5` (see HDF5.jl)."
h5open(index_path, "r+") do orig_file
@@ -119,4 +120,43 @@ function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<
write(file["chunks"], chunks)
write(file["embeddings"], embeddings)
end
-end
\ No newline at end of file
+end
+
+"""
+ urls_for_metadata(sources::Vector{String})
+
+Return a Dict mapping package names to their associated URLs
+Note: Because there can be many URLs, they are stripped down to the package root; package subpaths are not included in the metadata.
+"""
+function urls_for_metadata(sources::Vector{String})
+ urls = [split(source, " -")[1] for source in sources]
+ pattern = r"(/(?:stable|dev|latest|v\d+(?:\.\d+)*))"
+ cleaned_urls = [endswith(String(url), "/") ? String(url)[1:(end - 1)] : String(url)
+ for url in urls]
+ unique_urls = unique(cleaned_urls)
+ package_names = Vector{String}()
+
+ for url in unique_urls
+ push!(package_names, get_package_name(String(url)))
+ end
+
+ cleaned_urls = [match(pattern, url) !== nothing ? first(split(url, pattern)) : url
+ for url in unique_urls]
+
+ zipped = zip(cleaned_urls, package_names) |> collect
+ unique_pairs = unique(zipped)
+ unique_urls = [pair[1] for pair in unique_pairs]
+ unique_package_names = [pair[2] for pair in unique_pairs]
+
+ package_url_dict = Dict{String, Vector{String}}()
+ for (url, package_name) in zip(unique_urls, unique_package_names)
+ if haskey(package_url_dict, package_name)
+ # If the package_name is already a key, append the url to the existing array
+ push!(package_url_dict[package_name], url)
+ else
+ # Otherwise, create a new entry with the package_name and the url
+ package_url_dict[package_name] = [url]
+ end
+ end
+ return package_url_dict
+end
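A sketch of what `urls_for_metadata` returns. The source strings are assumed to follow the `"<url> - <heading path>"` shape the scraper emits, and the package name comes from `get_package_name`, so the exact key depends on the live page:

```julia
using DocsScraper

sources = [
    "https://docs.makie.org/stable/reference/blocks/axis - Axis/Attributes",
    "https://docs.makie.org/stable/tutorials/basic-tutorial - Tutorials",
]
urls_for_metadata(sources)
# e.g. Dict("makie" => ["https://docs.makie.org"]); version segments and subpaths are dropped
```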
diff --git a/test/crawl.jl b/test/crawl.jl
new file mode 100644
index 0000000..6b00ca4
--- /dev/null
+++ b/test/crawl.jl
@@ -0,0 +1,7 @@
+using DocsScraper: crawl
+
+@testset "crawl" begin
+ urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
+ hostname_url_dict = crawl(urls)
+ @test length(hostname_url_dict) > 0
+end
diff --git a/test/make_knowledge_packs.jl b/test/make_knowledge_packs.jl
new file mode 100644
index 0000000..5690725
--- /dev/null
+++ b/test/make_knowledge_packs.jl
@@ -0,0 +1,8 @@
+using DocsScraper: process_paths
+
+@testset "overall test" begin
+ url = "https://docs.julialang.org/en/v1/"
+ chunks, sources = process_paths(url)
+ @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing &&
+ sources[1] != nothing
+end
diff --git a/test/parser.jl b/test/parser.jl
new file mode 100644
index 0000000..0faeb04
--- /dev/null
+++ b/test/parser.jl
@@ -0,0 +1,11 @@
+using DocsScraper: parse_url_to_blocks, roll_up_chunks
+
+@testset "parse & roll_up" begin
+ url = "https://docs.julialang.org/en/v1/"
+ parsed_blocks = parse_url_to_blocks(url)
+ @test length(parsed_blocks) > 0
+ SEP = ""
+ docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
+ @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
+ sources_[1] != nothing
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 4b4a92c..6e1e7e8 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,51 +1,13 @@
-
+using DocsScraper
using Test
-using HTTP, Gumbo, AbstractTrees, URIs
-using Gumbo: HTMLDocument, HTMLElement
-using EzXML
-using PromptingTools
-const PT = PromptingTools
-const RT = PromptingTools.Experimental.RAGTools
-using LinearAlgebra, Unicode, SparseArrays
-using HDF5
-using Tar
-using Inflate
-using SHA
-using Serialization, URIs
-
-include(joinpath("..", "src", "crawl.jl"))
-include(joinpath("..", "src", "extract_urls.jl"))
-include(joinpath("..", "src", "parser.jl"))
-include(joinpath("..", "src", "preparation.jl"))
-include(joinpath("..", "src", "user_preferences.jl"))
-include(joinpath("..", "src", "utils.jl"))
-
-urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
-url = urls[1]
-queue = Vector{AbstractString}()
-
-@testset "HTTP" begin
- @test HTTP.get(url) != nothing
- result, sitemap_queue = check_robots_txt("*", url)
- @test result == true
-end
-
-@testset "crawl" begin
- hostname_url_dict = crawl(urls)
- @test length(hostname_url_dict) > 0
-end
+using Aqua
-@testset "parse & roll_up" begin
- parsed_blocks = parse_url_to_blocks(url)
- @test length(parsed_blocks) > 0
- SEP = ""
- docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
- @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
- sources_[1] != nothing
-end
+@testset "DocsScraper.jl" begin
+ @testset "Code quality (Aqua.jl)" begin
+ Aqua.test_all(DocsScraper; persistent_tasks = false)
+ end
-@testset "overall test" begin
- chunks, sources = process_paths(url)
- @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing &&
- sources[1] != nothing
+ include("crawl.jl")
+ include("parser.jl")
+ include("make_knowledge_packs.jl")
end
diff --git a/test/utils.jl b/test/utils.jl
new file mode 100644
index 0000000..fbe338a
--- /dev/null
+++ b/test/utils.jl
@@ -0,0 +1,10 @@
+using DocsScraper: parse_url_to_blocks, roll_up_chunks
+
+@testset "parse & roll_up" begin
+ parsed_blocks = parse_url_to_blocks(url)
+ @test length(parsed_blocks) > 0
+ SEP = ""
+ docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
+ @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
+ sources_[1] != nothing
+end
From 965873abf5b48f7bac6e99036d6b23a79dd54985 Mon Sep 17 00:00:00 2001
From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Thu, 15 Aug 2024 19:33:35 -0700
Subject: [PATCH 6/7] create a single index file
---
Project.toml | 6 +-
src/DocsScraper.jl | 1 +
src/make_knowledge_packs.jl | 140 +++++++++++++++++++++---------------
3 files changed, 87 insertions(+), 60 deletions(-)
diff --git a/Project.toml b/Project.toml
index 1fb77c2..ef5aaa9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,7 @@ Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
@@ -30,8 +31,10 @@ Gumbo = "0.8"
HDF5 = "0.17"
HTTP = "1.10"
Inflate = "0.1"
+JSON = "0.21"
LinearAlgebra = "1"
-PromptingTools = "0.48"
+PromptingTools = "0.49"
+Random = "1"
SHA = "0.7"
Serialization = "1"
SparseArrays = "1"
@@ -40,7 +43,6 @@ Test = "1"
URIs = "1.5"
Unicode = "1"
julia = "1.10"
-JSON = "0.21"
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/src/DocsScraper.jl b/src/DocsScraper.jl
index 7f114d9..0a65d57 100644
--- a/src/DocsScraper.jl
+++ b/src/DocsScraper.jl
@@ -13,6 +13,7 @@ using SHA
using Serialization, URIs
using Dates
using JSON
+using Random
include("parser.jl")
include("crawl.jl")
diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl
index 5d56ff8..a787edf 100644
--- a/src/make_knowledge_packs.jl
+++ b/src/make_knowledge_packs.jl
@@ -38,7 +38,6 @@ Parse URLs from hostname_url_dict and save the chunks
function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}},
knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
min_chunk_size::Int = MIN_CHUNK_SIZE)
- SAVE_CHUNKS = true
for (hostname, urls) in hostname_url_dict
output_chunks = Vector{SubString{String}}()
output_sources = Vector{String}()
@@ -52,16 +51,14 @@ function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractStri
@error "error!! check url: $url"
end
end
- if SAVE_CHUNKS
- serialize(
- joinpath(knowledge_pack_path,
- "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
- output_chunks)
- serialize(
- joinpath(knowledge_pack_path,
- "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
- output_sources)
- end
+ serialize(
+ joinpath(knowledge_pack_path,
+ "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
+ output_chunks)
+ serialize(
+ joinpath(knowledge_pack_path,
+ "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
+ output_sources)
end
end
@@ -87,19 +84,24 @@ end
"""
generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL,
- embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString)
+ embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString,
+ bool_embeddings::Bool = true, index_name::AbstractString = "")
Deserialize chunks and sources to generate embeddings
+Note: We highly recommend passing `index_name`; it becomes the name of the generated index. Default: date-randomInt
# Arguments
- model: Embedding model
- embedding_size: Embedding dimensions
- custom_metadata: Custom metadata like ecosystem name if required
+- bool_embeddings: If true, the generated embeddings will be boolean; Float32 otherwise
+- index_name: Name of the index. Default: date-randomInt
"""
function generate_embeddings(
knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
model::AbstractString = MODEL,
- embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString)
+ embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString,
+ bool_embeddings::Bool = true, index_name::AbstractString = "")
embedder = RT.BatchEmbedder()
entries = readdir(knowledge_pack_path)
# Initialize a dictionary to group files by hostname and chunk size
@@ -109,9 +111,6 @@ function generate_embeddings(
chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$"
sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$"
- # chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$"
- # sources_pattern = r"^(.*)-sources-(\d+)\.jls$"
-
# Group files by hostname and chunk size
for file in entries
match_chunks = match(chunks_pattern, file)
@@ -141,62 +140,83 @@ function generate_embeddings(
knowledge_pack_path, file)
end
end
- # Process each pair of files
+
+ chunks = Vector{SubString{String}}()
+ sources = Vector{String}()
+
+ # Add chunks and sources to vectors from each of the scraped file
for (hostname, chunk_files) in hostname_files
for (max_chunk_size, files) in chunk_files
if haskey(files, "chunks") && haskey(files, "sources")
chunks_file = files["chunks"]
sources_file = files["sources"]
- chunks = deserialize(chunks_file)
- sources = deserialize(sources_file)
- cost_tracker = Threads.Atomic{Float64}(0.0)
- full_embeddings = RT.get_embeddings(
- embedder, chunks; model, verbose = false, cost_tracker)
- @info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))"
-
- trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
- fn_output = joinpath(knowledge_pack_path, "packs",
- "$hostname-$model-$trunc-Float32__v1.0.tar.gz")
- fn_temp = joinpath(knowledge_pack_path, "packs",
- "$hostname-$model-$trunc-Float32__v1.0.hdf5")
-
- h5open(fn_temp, "w") do file
- file["chunks"] = chunks
- file["sources"] = sources
- file["embeddings"] = full_embeddings[1:embedding_size, :] |>
- l2_norm_columns |> x -> map(>(0), x)
- file["type"] = "ChunkIndex"
-
- package_url_dict = Dict{String, Vector{String}}()
- package_url_dict = urls_for_metadata(sources)
-
- metadata = Dict(
- :embedded_dt => Dates.today(),
- :custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size,
- :embedding_size => embedding_size, :model => model,
- :packages => package_url_dict)
-
- metadata_json = JSON.json(metadata)
- file["metadata"] = metadata_json
- end
-
- command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
- run(command)
- report_artifact(fn_output)
-
+ append!(chunks, deserialize(chunks_file))
+ append!(sources, deserialize(sources_file))
else
@warn "Missing pair for hostname: $hostname, max chunk size: $max_chunk_size"
end
end
end
+
+ # Generate embeddings
+ cost_tracker = Threads.Atomic{Float64}(0.0)
+ full_embeddings = RT.get_embeddings(
+ embedder, chunks; model, verbose = false, cost_tracker)
+
+ full_embeddings = full_embeddings[1:embedding_size, :] |>
+ l2_norm_columns
+
+ if bool_embeddings
+ full_embeddings = map(>(0), full_embeddings)
+ end
+
+ if isempty(index_name)
+ rand_int = rand(1000:100000)
+ date = Dates.today()
+ index_name = "$(date)-$(rand_int)"
+ end
+
+ @info "Created embeddings for $index_name. Cost: \$$(round(cost_tracker[], digits=3))"
+
+ trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
+ emb_data_type = bool_embeddings ? "Bool" : "Float32"
+
+ fn_output = joinpath(knowledge_pack_path, "packs",
+ "$index_name-$model-$trunc-$(emb_data_type)__v1.0.tar.gz")
+ fn_temp = joinpath(knowledge_pack_path, "packs",
+ "$index_name-$model-$trunc-$(emb_data_type)__v1.0.hdf5")
+
+ h5open(fn_temp, "w") do file
+ file["chunks"] = chunks
+ file["sources"] = sources
+ file["embeddings"] = full_embeddings
+ file["type"] = "ChunkIndex"
+
+ package_url_dict = Dict{String, Vector{String}}()
+ package_url_dict = urls_for_metadata(sources)
+
+ metadata = Dict(
+ :embedded_dt => Dates.today(),
+ :custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size,
+ :embedding_size => embedding_size, :model => model,
+ :packages => package_url_dict)
+
+ metadata_json = JSON.json(metadata)
+ file["metadata"] = metadata_json
+ end
+
+ command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
+ run(command)
+ report_artifact(fn_output)
end
"""
make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],
max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE,
- custom_metadata::AbstractString)
+ custom_metadata::AbstractString, bool_embeddings::Bool = true, index_name::AbstractString = "")
-Entry point to crawl, parse and generate embeddings
+Entry point to crawl, parse and generate embeddings.
+Note: We highly recommend passing `index_name`; it becomes the name of the generated index. Default: date-randomInt
# Arguments
- crawlable_urls: URLs that should be crawled to find more links
@@ -206,11 +226,14 @@ Entry point to crawl, parse and generate embeddings
- model: Embedding model
- embedding_size: Embedding dimensions
- custom_metadata: Custom metadata like ecosystem name if required
+- bool_embeddings: If true, the generated embeddings will be boolean; Float32 otherwise
+- index_name: Name of the index. Default: date-randomInt
"""
function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[];
single_urls::Vector{<:AbstractString} = String[],
max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE,
- model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "")
+ model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "",
+ bool_embeddings::Bool = true, index_name::AbstractString = "")
if isempty(crawlable_urls) && isempty(single_urls)
error("At least one of `input_urls` or `single_pages` must be provided.")
end
@@ -241,5 +264,6 @@ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[
make_chunks(
hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size)
generate_embeddings(
- knowledge_pack_path; max_chunk_size, model, embedding_size, custom_metadata)
+ knowledge_pack_path; max_chunk_size, model, embedding_size,
+ custom_metadata, bool_embeddings, index_name)
end
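Putting patch 6 together, a single combined index could be produced like this (names are illustrative):

```julia
using DocsScraper

make_knowledge_packs(["https://docs.sciml.ai/Overview/stable/"];
    index_name = "sciml",
    custom_metadata = "SciML ecosystem",
    bool_embeddings = true)   # Bool embeddings; set to false to keep Float32
```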
From b2c629ab64599da6be4e0f08b46d2e5567fbd8a6 Mon Sep 17 00:00:00 2001
From: Shreyas Shirish Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Thu, 15 Aug 2024 20:19:03 -0700
Subject: [PATCH 7/7] Update Project.toml
---
Project.toml | 1 -
1 file changed, 1 deletion(-)
diff --git a/Project.toml b/Project.toml
index 3cbe2e5..ef5aaa9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -43,7 +43,6 @@ Test = "1"
URIs = "1.5"
Unicode = "1"
julia = "1.10"
-JSON = "0.21"
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"