From 57d3c10297c5a05e9cf13941275dad695cddccde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 15:59:07 -0500
Subject: [PATCH 01/25] =?UTF-8?q?=F0=9F=9B=91=20sanitize=20the=20download/?=
 =?UTF-8?q?extract=20step?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deps/build.jl       | 64 ++++++-------------------------------
 src/NCBITaxonomy.jl | 25 ++++++---------
 src/hydrate.jl      | 78 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 71 deletions(-)
 create mode 100644 src/hydrate.jl

diff --git a/deps/build.jl b/deps/build.jl
index b914cde..2da9dcc 100644
--- a/deps/build.jl
+++ b/deps/build.jl
@@ -3,66 +3,20 @@ import GZip
 import Tar
 import Arrow
 import DataFrames
+import Downloads
 
-# URL for the taxonomy dump
-const ncbi_ftp = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/"
-const archive = ncbi_ftp * "new_taxdump.tar.gz"
-const checksum = archive * ".md5"
-
-if !haskey(ENV, "NCBITAXONOMY_PATH")
-    @warn """
-    The environmental variable NCBITAXONOMY_PATH is not set, so the tables will
-    be stored in your home directory. This is not ideal, and you really should set
-    the NCBITAXONOMY_PATH.
-
-    This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup
-    file. The path will be created automatically if it does not exist.
-    """
-end
-const taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy"))
-ispath(taxpath) || mkpath(taxpath)
-
-chk_file = download(checksum)
-chk = split(readlines(chk_file)[1], " ")[1]
-@info "Checksum of the most recent NCBI taxonomy: $(chk)"
-
-function download_dump(url, chk, dest)
-    @info "Downloading the taxonomy data from $(url)"
-    if ispath(joinpath(taxpath, dest))
-        @info "Removing the previous version of the taxonomy"
-        rm(joinpath(taxpath, dest); force=true, recursive=true)
-        mkpath(joinpath(taxpath, dest))
-    else
-        mkpath(joinpath(taxpath, dest))
-    end
-    arc = download(url)
-    vrf = bytes2hex(open(MD5.md5, arc))
-    vrf == chk || throw(ErrorException("Wrong checksum for the NCBI taxonomy archive file - unable to download"))
-    write(joinpath(taxpath, ".checksum"), vrf)
-    Tar.extract(GZip.open(arc), joinpath(taxpath, dest))
-end
-
-# The next block is about making sure that we don't download something that has
-# not changed when we build the package. The taxonomy dump is not gigantic, but
-# there is no need to get it over and over again.
-if !isfile(joinpath(taxpath, ".checksum"))
-    @info "No local taxonomy checksum found"
-    download_dump(archive, chk, "dump")
-else
-    local_chk = readline(joinpath(taxpath, ".checksum"))
-    if local_chk != chk
-        @info "Local and remote checksum do not match"
-        download_dump(archive, chk, "dump")
-    else
-        @info "Local taxonomy dump ($(local_chk)) is up to date"
-    end
-end
+# These steps are meant to download and unpack the taxonomy as needed, which is
+# to say as unfrequently as possible
+include(joinpath(@__DIR__, "..", "src", "hydrate.jl"))
+remote_info = _remote_archive_path()
+local_path = _local_archive_path()
+remote_checksum = _get_current_remote_checksum(local_path, remote_info)
+local_archive = _unpack_if_needed(local_path, remote_info, remote_checksum)
 
 @info "Materializing the taxonomy"
 
 # We will store the tables used by the package in the tables folder
-tables = joinpath(taxpath, "tables")
-ispath(tables) || mkpath(tables)
+tables_path = _create_or_get_tables_path(local_path)
 
 # Utility functions
 
diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl
index 5d01411..fc27fbe 100644
--- a/src/NCBITaxonomy.jl
+++ b/src/NCBITaxonomy.jl
@@ -4,24 +4,17 @@ using Arrow
 using StringDistances
 using AbstractTrees
 
-if !haskey(ENV, "NCBITAXONOMY_PATH")
-    @warn """
-    The environmental variable NCBITAXONOMY_PATH is not set, so the tables will
-    be stored in the package path. This is not ideal, and you really should set
-    the NCBITAXONOMY_PATH.
-
-    This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup
-    file. The path will be created automatically if it does not exist.
-    """
-end
-const taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy"))
-ispath(taxpath) || mkpath(taxpath)
+# Point to where the taxonomy is located
+include("hydrate.jl")
+local_path = _local_archive_path()
 
 function __init__()
-    name_date = mtime(joinpath(taxpath, "tables", "names.arrow"))
-    return time() - name_date >= 2.6e+6 && @warn(
-        "Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version."
-    )
+    name_date = mtime(joinpath(local_path, "tables", "names.arrow"))
+    over_30_days = time() - name_date >= 2.6e+6
+    if over_30_days 
+        @warn("Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version.")
+    end
+    return nothing
 end
 
 include("types.jl")
diff --git a/src/hydrate.jl b/src/hydrate.jl
new file mode 100644
index 0000000..0cf76b6
--- /dev/null
+++ b/src/hydrate.jl
@@ -0,0 +1,78 @@
+function _remote_archive_path(;
+    ncbi_ftp = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/",
+)::NamedTuple{
+    (:url, :archive, :checksum),
+    Tuple{String, String, String},
+}
+    return (
+        url = ncbi_ftp,
+        archive = "new_taxdump.tar.gz",
+        checksum = "new_taxdump.tar.gz.md5",
+    )
+end
+
+"""
+    _local_archive_path()
+
+Returns the path where the taxonomy dump is stored, and throws a warning if the
+path is not set as an environmental variable. This is used during the build step
+*and* during the initial startup of the package.
+"""
+function _local_archive_path()::String
+    if !haskey(ENV, "NCBITAXONOMY_PATH")
+        @warn """
+        The environmental variable NCBITAXONOMY_PATH is not set, so the tables will
+        be stored in your home directory. This is not ideal, and you really should set
+        the NCBITAXONOMY_PATH.
+
+        This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup
+        file. The path will be created automatically if it does not exist.
+        """
+    end
+    taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy"))
+    ispath(taxpath) || mkpath(taxpath)
+    return taxpath
+end
+
+function _get_current_remote_checksum(local_path, remote_info)
+    chk_file = Downloads.download(
+        remote_info.url * remote_info.checksum,
+        joinpath(local_path, ".checksum.remote"),
+    )
+    return split(readlines(chk_file)[1], " ")[1]
+end
+
+function _download_archive(local_path, remote_info)
+    Downloads.download(
+        remote_info.url * remote_info.archive,
+        joinpath(local_path, remote_info.archive),
+    )
+    return joinpath(local_path, remote_info.archive)
+end
+
+function _unpack_if_needed(local_path, remote_info, remote_checksum)
+    local_archive = joinpath(local_path, remote_info.archive)
+    need_update = false
+    if ~isfile(local_archive)
+        local_archive = _download_archive(local_path, remote_info)
+        local_checksum = bytes2hex(open(MD5.md5, local_archive))
+        need_update = true
+    else
+        local_checksum = bytes2hex(open(MD5.md5, local_archive))
+    end
+    if local_checksum != remote_checksum
+        local_archive = _download_archive(local_path, remote_info)
+        local_checksum = bytes2hex(open(MD5.md5, local_archive))
+        need_update = true
+    end
+    if need_update
+        Tar.extract(GZip.open(local_archive), joinpath(local_path, "dump"))
+    end
+    return joinpath(local_path, "dump")
+end
+
+function _create_or_get_tables_path(local_path)
+    tables_path = joinpath(local_path, "tables")
+    ispath(tables_path) || mkpath(tables_path)
+    return tables_path
+end
\ No newline at end of file

From c854acaaa43aef84ae9cca623618b8626b0a3e55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 16:19:43 -0500
Subject: [PATCH 02/25] =?UTF-8?q?=F0=9F=9A=AE=20only=20check=20the=20local?=
 =?UTF-8?q?=20checksum=20after=20making=20sure=20the=20archive=20is=20here?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deps/build.jl  | 78 ++++++++++++++++++++++++++++----------------------
 src/hydrate.jl |  5 ++--
 2 files changed, 45 insertions(+), 38 deletions(-)

diff --git a/deps/build.jl b/deps/build.jl
index 2da9dcc..eb580b3 100644
--- a/deps/build.jl
+++ b/deps/build.jl
@@ -5,23 +5,22 @@ import Arrow
 import DataFrames
 import Downloads
 
+# There are two things we need for the build process: the types, and the
+# location of the files
+include(joinpath(@__DIR__, "..", "src", "hydrate.jl"))
+include(joinpath(@__DIR__, "..", "src", "types.jl"))
+
 # These steps are meant to download and unpack the taxonomy as needed, which is
 # to say as unfrequently as possible
-include(joinpath(@__DIR__, "..", "src", "hydrate.jl"))
 remote_info = _remote_archive_path()
 local_path = _local_archive_path()
 remote_checksum = _get_current_remote_checksum(local_path, remote_info)
 local_archive = _unpack_if_needed(local_path, remote_info, remote_checksum)
 
-@info "Materializing the taxonomy"
-
 # We will store the tables used by the package in the tables folder
 tables_path = _create_or_get_tables_path(local_path)
 
 # Utility functions
-
-include(joinpath(@__DIR__, "..", "src", "types.jl"))
-
 function _class_to_enum(c::T) where {T <: String}
     c = replace(c, " " => "_")
     c = replace(c, "-" => "_")
@@ -38,8 +37,8 @@ issue.
 function _materialize_data(::Type{T}, v) where {T}
     if v != ""
         T <: Number && return parse(T, v)
-        T <: Union{Bool,Missing} && return parse(Bool, v)
-        T <: Union{Int,Missing} && return parse(Int, v)
+        T <: Union{Bool, Missing} && return parse(Bool, v)
+        T <: Union{Int, Missing} && return parse(Int, v)
         T <: Symbol && return Symbol(v)
         T <: NCBINameClass && return _class_to_enum(v)
         return v
@@ -61,44 +60,53 @@ function _build_arrow_file(df, dump_file)
     return df
 end
 
-# Get the data
-
-@info "Building the names file"
-ncbi_names_file_in = joinpath(taxpath, "dump", "names.dmp")
-ncbi_names_file_out = joinpath(taxpath, "tables", "names.arrow")
-ncbi_names = DataFrames.DataFrame(tax_id=Int[], name=String[], unique_name=Union{String,Missing}[], class=NCBINameClass[])
+# Get the data for the names
+ncbi_names_file_in = joinpath(local_path, "dump", "names.dmp")
+ncbi_names_file_out = joinpath(tables_path, "names.arrow")
+ncbi_names = DataFrames.DataFrame(;
+    tax_id = Int[],
+    name = String[],
+    unique_name = Union{String, Missing}[],
+    class = NCBINameClass[],
+)
 names_df = _build_arrow_file(ncbi_names, ncbi_names_file_in)
 names_df.class = Int.(names_df.class)
 Arrow.write(ncbi_names_file_out, names_df)
 names_df = nothing
 GC.gc()
 
-@info "Building the division file"
-ncbi_division_file_in = joinpath(taxpath, "dump", "division.dmp")
-ncbi_division_file_out = joinpath(taxpath, "tables", "division.arrow")
-ncbi_division = DataFrames.DataFrame(division_id=Int[], division_code=Symbol[], division_name=Symbol[], comments=Union{String,Missing}[])
+ncbi_division_file_in = joinpath(local_path, "dump", "division.dmp")
+ncbi_division_file_out = joinpath(tables_path, "division.arrow")
+ncbi_division = DataFrames.DataFrame(;
+    division_id = Int[],
+    division_code = Symbol[],
+    division_name = Symbol[],
+    comments = Union{String, Missing}[],
+)
 division_df = _build_arrow_file(ncbi_division, ncbi_division_file_in)
 Arrow.write(ncbi_division_file_out, division_df)
 division_df = nothing
 GC.gc()
 
-@info "Building the nodes file"
-ncbi_nodes_file_in = joinpath(taxpath, "dump", "nodes.dmp")
-ncbi_nodes_file_out = joinpath(taxpath, "tables", "nodes.arrow")
-ncbi_nodes = DataFrames.DataFrame(
-    tax_id=Int[], parent_tax_id=Int[],
-    rank=Symbol[],
-    embl=Union{String,Missing}[],
-    division_id=Int[], inherited_div=Union{Bool,Missing}[],
-    genetic_code_id=Int[], inherited_gc=Union{Bool,Missing}[], 
-    mitochondrial_genetic_code_id=Union{Int,Missing}[], inherited_mgc=Union{Bool,Missing}[],
-    genbank_hidden=Union{Bool,Missing}[],
-    hidden_subtree=Union{Bool,Missing}[],
-    comments=Union{String,Missing}[],
-    plastid_genetic_code_id=Union{Int,Missing}[], inherited_pgc=Union{Bool,Missing}[],
-    specified_species=Union{Bool,Missing}[],
-    hydrogenosome_code_id=Union{Int,Missing}[], inherited_hgc=Union{Bool,Missing}[]
-    )
+ncbi_nodes_file_in = joinpath(local_path, "dump", "nodes.dmp")
+ncbi_nodes_file_out = joinpath(tables_path, "nodes.arrow")
+ncbi_nodes = DataFrames.DataFrame(;
+    tax_id = Int[], parent_tax_id = Int[],
+    rank = Symbol[],
+    embl = Union{String, Missing}[],
+    division_id = Int[], inherited_div = Union{Bool, Missing}[],
+    genetic_code_id = Int[], inherited_gc = Union{Bool, Missing}[],
+    mitochondrial_genetic_code_id = Union{Int, Missing}[],
+    inherited_mgc = Union{Bool, Missing}[],
+    genbank_hidden = Union{Bool, Missing}[],
+    hidden_subtree = Union{Bool, Missing}[],
+    comments = Union{String, Missing}[],
+    plastid_genetic_code_id = Union{Int, Missing}[],
+    inherited_pgc = Union{Bool, Missing}[],
+    specified_species = Union{Bool, Missing}[],
+    hydrogenosome_code_id = Union{Int, Missing}[],
+    inherited_hgc = Union{Bool, Missing}[],
+)
 nodes_df = _build_arrow_file(ncbi_nodes, ncbi_nodes_file_in)
 Arrow.write(ncbi_nodes_file_out, nodes_df)
 nodes_df = nothing
diff --git a/src/hydrate.jl b/src/hydrate.jl
index 0cf76b6..b6c3011 100644
--- a/src/hydrate.jl
+++ b/src/hydrate.jl
@@ -55,17 +55,16 @@ function _unpack_if_needed(local_path, remote_info, remote_checksum)
     need_update = false
     if ~isfile(local_archive)
         local_archive = _download_archive(local_path, remote_info)
-        local_checksum = bytes2hex(open(MD5.md5, local_archive))
         need_update = true
-    else
-        local_checksum = bytes2hex(open(MD5.md5, local_archive))
     end
+    local_checksum = bytes2hex(open(MD5.md5, local_archive))
     if local_checksum != remote_checksum
         local_archive = _download_archive(local_path, remote_info)
         local_checksum = bytes2hex(open(MD5.md5, local_archive))
         need_update = true
     end
     if need_update
+        rm(joinpath(local_path, "dump"); force=true, recursive=true)
         Tar.extract(GZip.open(local_archive), joinpath(local_path, "dump"))
     end
     return joinpath(local_path, "dump")

From eae76182675c8e002b683c5be53436d1393d7fdc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 16:24:01 -0500
Subject: [PATCH 03/25] =?UTF-8?q?=E2=9C=A8=20build=20step=20cleaned=20up?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/hydrate.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/hydrate.jl b/src/hydrate.jl
index b6c3011..a3902df 100644
--- a/src/hydrate.jl
+++ b/src/hydrate.jl
@@ -54,16 +54,19 @@ function _unpack_if_needed(local_path, remote_info, remote_checksum)
     local_archive = joinpath(local_path, remote_info.archive)
     need_update = false
     if ~isfile(local_archive)
+        @warn "There is no local taxonomy dump, we will download one"
         local_archive = _download_archive(local_path, remote_info)
         need_update = true
     end
     local_checksum = bytes2hex(open(MD5.md5, local_archive))
     if local_checksum != remote_checksum
+        @warn "The checksum of the taxonomy dump does not match the remote"
         local_archive = _download_archive(local_path, remote_info)
         local_checksum = bytes2hex(open(MD5.md5, local_archive))
         need_update = true
     end
     if need_update
+        @warn "We are unpacking the local taxonomy dump"
         rm(joinpath(local_path, "dump"); force=true, recursive=true)
         Tar.extract(GZip.open(local_archive), joinpath(local_path, "dump"))
     end

From 16838013d4fccdd0dd96d0877780eccd10eba39d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 16:43:24 -0500
Subject: [PATCH 04/25] =?UTF-8?q?=E2=9A=99=EF=B8=8F=20move=20the=20code=20?=
 =?UTF-8?q?to=20read=20the=20taxonomy=20out=20of=20the=20main=20file?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deps/build.jl             |  3 ++-
 {src => deps}/hydrate.jl  | 23 -----------------------
 src/NCBITaxonomy.jl       | 35 +++++++----------------------------
 src/local_archive_path.jl | 28 ++++++++++++++++++++++++++++
 src/read_taxonomy.jl      | 34 ++++++++++++++++++++++++++++++++++
 5 files changed, 71 insertions(+), 52 deletions(-)
 rename {src => deps}/hydrate.jl (69%)
 create mode 100644 src/local_archive_path.jl
 create mode 100644 src/read_taxonomy.jl

diff --git a/deps/build.jl b/deps/build.jl
index eb580b3..5eaaa56 100644
--- a/deps/build.jl
+++ b/deps/build.jl
@@ -7,8 +7,9 @@ import Downloads
 
 # There are two things we need for the build process: the types, and the
 # location of the files
-include(joinpath(@__DIR__, "..", "src", "hydrate.jl"))
+include(joinpath(@__DIR__, "hydrate.jl"))
 include(joinpath(@__DIR__, "..", "src", "types.jl"))
+include(joinpath(@__DIR__, "..", "src", "local_archive_path.jl"))
 
 # These steps are meant to download and unpack the taxonomy as needed, which is
 # to say as unfrequently as possible
diff --git a/src/hydrate.jl b/deps/hydrate.jl
similarity index 69%
rename from src/hydrate.jl
rename to deps/hydrate.jl
index a3902df..8504e64 100644
--- a/src/hydrate.jl
+++ b/deps/hydrate.jl
@@ -11,29 +11,6 @@ function _remote_archive_path(;
     )
 end
 
-"""
-    _local_archive_path()
-
-Returns the path where the taxonomy dump is stored, and throws a warning if the
-path is not set as an environmental variable. This is used during the build step
-*and* during the initial startup of the package.
-"""
-function _local_archive_path()::String
-    if !haskey(ENV, "NCBITAXONOMY_PATH")
-        @warn """
-        The environmental variable NCBITAXONOMY_PATH is not set, so the tables will
-        be stored in your home directory. This is not ideal, and you really should set
-        the NCBITAXONOMY_PATH.
-
-        This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup
-        file. The path will be created automatically if it does not exist.
-        """
-    end
-    taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy"))
-    ispath(taxpath) || mkpath(taxpath)
-    return taxpath
-end
-
 function _get_current_remote_checksum(local_path, remote_info)
     chk_file = Downloads.download(
         remote_info.url * remote_info.checksum,
diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl
index fc27fbe..bf5a69a 100644
--- a/src/NCBITaxonomy.jl
+++ b/src/NCBITaxonomy.jl
@@ -1,15 +1,15 @@
 module NCBITaxonomy
 using DataFrames
-using Arrow
+import Arrow
 using StringDistances
 using AbstractTrees
 
 # Point to where the taxonomy is located
-include("hydrate.jl")
-local_path = _local_archive_path()
+include("local_archive_path.jl")
+tables_path = _create_or_get_tables_path(_local_archive_path())
 
 function __init__()
-    name_date = mtime(joinpath(local_path, "tables", "names.arrow"))
+    name_date = mtime(joinpath(tables_path, "names.arrow"))
     over_30_days = time() - name_date >= 2.6e+6
     if over_30_days 
         @warn("Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version.")
@@ -23,30 +23,9 @@ export NCBITaxon, NCBINameClass, IDNotFoundInBackbone
 include("exceptions.jl")
 export NameHasNoDirectMatch, NameHasMultipleMatches
 
-names_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "names.arrow")))
-names_table.class = NCBINameClass.(names_table.class)
-names_table.lowercase = lowercase.(names_table.name)
-
-division_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "division.arrow")))
-select!(division_table, Not(:comments))
-
-nodes_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "nodes.arrow")))
-select!(nodes_table, Not(r"inherited_"))
-select!(nodes_table, Not(r"_code_id"))
-select!(nodes_table, Not(:genbank_hidden))
-select!(nodes_table, Not(:hidden_subtree))
-select!(nodes_table, Not(:comments))
-select!(nodes_table, Not(:embl))
-
-nodes_table = innerjoin(nodes_table, division_table; on = :division_id)
-select!(nodes_table, Not(:division_id))
-
-names_table = leftjoin(
-    names_table,
-    unique(select(nodes_table, [:tax_id, :rank, :parent_tax_id]));
-    on = :tax_id,
-)
-scinames_table = names_table[findall(names_table.class .== class_scientific_name), :]
+# We load the core file with all we need in it
+include("read_taxonomy.jl")
+taxonomy = read_taxonomy(tables_path)
 
 include("taxon.jl")
 export taxon, @ncbi_str
diff --git a/src/local_archive_path.jl b/src/local_archive_path.jl
new file mode 100644
index 0000000..252ca2c
--- /dev/null
+++ b/src/local_archive_path.jl
@@ -0,0 +1,28 @@
+"""
+    _local_archive_path()
+
+Returns the path where the taxonomy dump is stored, and emits a warning if the
+`NCBITAXONOMY_PATH` environment variable is not set. This is used during the
+build step *and* during the initial startup of the package.
+"""
+function _local_archive_path()::String
+    if !haskey(ENV, "NCBITAXONOMY_PATH")
+        @warn """
+        The environmental variable NCBITAXONOMY_PATH is not set, so the tables will
+        be stored in your home directory. This is not ideal, and you really should set
+        the NCBITAXONOMY_PATH.
+
+        This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup
+        file. The path will be created automatically if it does not exist.
+        """
+    end
+    taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy"))
+    ispath(taxpath) || mkpath(taxpath)
+    return taxpath
+end
+
+function _create_or_get_tables_path(local_path)
+    tables_path = joinpath(local_path, "tables")
+    ispath(tables_path) || mkpath(tables_path)
+    return tables_path
+end
\ No newline at end of file
diff --git a/src/read_taxonomy.jl b/src/read_taxonomy.jl
new file mode 100644
index 0000000..4a43f6e
--- /dev/null
+++ b/src/read_taxonomy.jl
@@ -0,0 +1,34 @@
+function read_taxonomy(tables_path)
+
+    # Prepare the files we will actually use
+    names_table = DataFrame(Arrow.Table(joinpath(tables_path, "names.arrow")))
+    names_table.class = NCBINameClass.(names_table.class)
+    names_table.lowercase = lowercase.(names_table.name)
+
+    division_table = DataFrame(Arrow.Table(joinpath(tables_path, "division.arrow")))
+    select!(division_table, Not(:comments))
+
+    nodes_table = DataFrame(Arrow.Table(joinpath(tables_path, "nodes.arrow")))
+    select!(nodes_table, Not(r"inherited_"))
+    select!(nodes_table, Not(r"_code_id"))
+    select!(nodes_table, Not(:genbank_hidden))
+    select!(nodes_table, Not(:hidden_subtree))
+    select!(nodes_table, Not(:comments))
+    select!(nodes_table, Not(:embl))
+
+    nodes_table = innerjoin(nodes_table, division_table; on = :division_id)
+    select!(nodes_table, Not(:division_id))
+
+    names_table = leftjoin(
+        names_table,
+        unique(select(nodes_table, [:tax_id, :rank, :parent_tax_id]));
+        on = :tax_id,
+    )
+
+    nodes_table = nothing
+    division_table = nothing
+    GC.gc()
+
+    return names_table
+
+end
\ No newline at end of file

From 36306cf75ca71128e811db2402dfaac2619013e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 16:56:24 -0500
Subject: [PATCH 05/25] =?UTF-8?q?=E2=9A=A1=20only=20keep=20two=20taxo=20db?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/NCBITaxonomy.jl             | 12 ++++++++----
 src/interfaces/abstracttrees.jl | 16 ++++++++--------
 src/namefilters/namefilter.jl   |  2 +-
 src/taxon.jl                    |  6 +++---
 src/utility/nametools.jl        |  6 +++---
 src/utility/similarnames.jl     |  6 ++----
 6 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl
index bf5a69a..160cf29 100644
--- a/src/NCBITaxonomy.jl
+++ b/src/NCBITaxonomy.jl
@@ -11,21 +11,25 @@ tables_path = _create_or_get_tables_path(_local_archive_path())
 function __init__()
     name_date = mtime(joinpath(tables_path, "names.arrow"))
     over_30_days = time() - name_date >= 2.6e+6
-    if over_30_days 
-        @warn("Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version.")
+    if over_30_days
+        @warn(
+            "Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version."
+        )
     end
     return nothing
 end
 
 include("types.jl")
-export NCBITaxon, NCBINameClass, IDNotFoundInBackbone
+export NCBITaxon, NCBINameClass
 
 include("exceptions.jl")
-export NameHasNoDirectMatch, NameHasMultipleMatches
+export NameHasNoDirectMatch, NameHasMultipleMatches, IDNotFoundInBackbone
 
 # We load the core file with all we need in it
 include("read_taxonomy.jl")
 taxonomy = read_taxonomy(tables_path)
+scinames = filter(r -> r.class == NCBITaxonomy.class_scientific_name, taxonomy)
+groupedscinames = groupby(scinames, :tax_id)
 
 include("taxon.jl")
 export taxon, @ncbi_str
diff --git a/src/interfaces/abstracttrees.jl b/src/interfaces/abstracttrees.jl
index b50b24a..089d274 100644
--- a/src/interfaces/abstracttrees.jl
+++ b/src/interfaces/abstracttrees.jl
@@ -4,13 +4,13 @@
 Returns the children of a taxon.
 """
 function AbstractTrees.children(tax::NCBITaxon)
-    positions = findall(isequal(tax.id), NCBITaxonomy.scinames_table.parent_tax_id)
+    positions = findall(isequal(tax.id), NCBITaxonomy.scinames.parent_tax_id)
     if ~isempty(positions)
         list_of_children = Vector{NCBITaxon}(undef, length(positions))
         for i in axes(positions, 1)
             list_of_children[i] = NCBITaxon(
-                NCBITaxonomy.scinames_table.name[positions[i]],
-                NCBITaxonomy.scinames_table.tax_id[positions[i]],
+                NCBITaxonomy.scinames.name[positions[i]],
+                NCBITaxonomy.scinames.tax_id[positions[i]],
             )
         end
         return list_of_children
@@ -25,15 +25,15 @@ end
 Returns the taxon from which the argument taxon is descended.
 """
 function AbstractTrees.parent(tax::NCBITaxon)
-    position = findfirst(isequal(tax.id), NCBITaxonomy.scinames_table.tax_id)
+    position = findfirst(isequal(tax.id), NCBITaxonomy.scinames.tax_id)
     if ~isnothing(position)
         parent_position = findfirst(
-            isequal(NCBITaxonomy.scinames_table.parent_tax_id[position]),
-            NCBITaxonomy.scinames_table.tax_id,
+            isequal(NCBITaxonomy.scinames.parent_tax_id[position]),
+            NCBITaxonomy.scinames.tax_id,
         )
         return NCBITaxon(
-            NCBITaxonomy.scinames_table.name[parent_position],
-            NCBITaxonomy.scinames_table.tax_id[parent_position],
+            NCBITaxonomy.scinames.name[parent_position],
+            NCBITaxonomy.scinames.tax_id[parent_position],
         )
     else
         return nothing
diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl
index 24a9960..56e1d42 100644
--- a/src/namefilters/namefilter.jl
+++ b/src/namefilters/namefilter.jl
@@ -4,7 +4,7 @@
 Returns a subset of the names table where only the given taxids are present.
 """
 function namefilter(ids::Vector{T}) where {T <: Integer}
-    return leftjoin(DataFrame(; tax_id = ids), NCBITaxonomy.names_table; on = :tax_id)
+    return leftjoin(DataFrame(; tax_id = ids), NCBITaxonomy.taxonomy; on = :tax_id)
 end
 
 """
diff --git a/src/taxon.jl b/src/taxon.jl
index 2d14c1b..eb08a6a 100644
--- a/src/taxon.jl
+++ b/src/taxon.jl
@@ -15,10 +15,10 @@ end
 
 Performs a search in the entire taxonomy backbone based on a known ID.
 """
-taxon(id::Integer) = taxon(NCBITaxonomy.scinames_table, id)
+taxon(id::Integer) = taxon(NCBITaxonomy.scinames, id)
 
 function _id_from_name(name::AbstractString; kwargs...)
-    return _id_from_name(NCBITaxonomy.names_table, name; kwargs...)
+    return _id_from_name(NCBITaxonomy.taxonomy, name; kwargs...)
 end
 
 function _id_from_name(
@@ -91,7 +91,7 @@ The keywords are:
   - `onlysynonyms` (def. `false`) - limits the search to synonyms, which may be
     useful in case the taxonomy is particularly outdated
 """
-taxon(name::AbstractString; kwargs...) = taxon(NCBITaxonomy.names_table, name; kwargs...)
+taxon(name::AbstractString; kwargs...) = taxon(NCBITaxonomy.taxonomy, name; kwargs...)
 
 """
     taxon(df::DataFrame, name::AbstractString; kwargs...)
diff --git a/src/utility/nametools.jl b/src/utility/nametools.jl
index 59377f5..6d08f98 100644
--- a/src/utility/nametools.jl
+++ b/src/utility/nametools.jl
@@ -6,7 +6,7 @@ of names if found. It searches the "common name" and "genbank common name"
 category of the NCBI taxonomy name table.
 """
 function vernacular(t::NCBITaxon)
-    x = NCBITaxonomy.names_table[findall(NCBITaxonomy.names_table.tax_id .== t.id), :]
+    x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :]
     p = findall(
         !isnothing,
         indexin(
@@ -24,7 +24,7 @@ This function will return `nothing` if no synonyms exist, and an array of names
 if they do. It returns all of the
 """
 function synonyms(t::NCBITaxon)
-    x = NCBITaxonomy.names_table[findall(NCBITaxonomy.names_table.tax_id .== t.id), :]
+    x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :]
     p = findall(isequal(NCBITaxonomy.class_synonym), x.class)
     return length(p) == 0 ? nothing : x.name[p]
 end
@@ -36,7 +36,7 @@ This function will return `nothing` if no authority exist, and a string with the
 authority if found.
 """
 function authority(t::NCBITaxon)
-    x = NCBITaxonomy.names_table[findall(NCBITaxonomy.names_table.tax_id .== t.id), :]
+    x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :]
     p = findall(isequal(NCBITaxonomy.class_authority), x.class)
     return length(p) == 0 ? nothing : first(x.name[p])
 end
diff --git a/src/utility/similarnames.jl b/src/utility/similarnames.jl
index 218f571..2511eee 100644
--- a/src/utility/similarnames.jl
+++ b/src/utility/similarnames.jl
@@ -19,8 +19,7 @@ distance - the pair will be returned only once.
 Additional keywords are `rank` (limit to a given rank) and `onlysynonyms`.
 """
 function similarnames(name::AbstractString; kwargs...)
-    df = NCBITaxonomy.names_table
-    return similarnames(df, name; kwargs...)
+    return similarnames(NCBITaxonomy.taxonomy, name; kwargs...)
 end
 
 """
@@ -72,6 +71,5 @@ does *strict*, *case-sensitive* searches only at the moment, but this may be
 extended through keyword arguments in a future release.
 """
 function alternativetaxa(name::AbstractString)
-    df = NCBITaxonomy.names_table
-    return alternativetaxa(df, name)
+    return alternativetaxa(NCBITaxonomy.taxonomy, name)
 end
\ No newline at end of file

From 9340b8598f6dafde85f2cb4706e93f293923e2a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 16:57:34 -0500
Subject: [PATCH 06/25] =?UTF-8?q?=E2=9E=95=20Downloads=20(tmp.=20Revise)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Project.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Project.toml b/Project.toml
index 63d4dac..eaf79e3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,8 +7,10 @@ version = "0.4.1"
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
 MD5 = "6ac74813-4b46-53a4-afec-0b5dc9d7885c"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
 StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
 Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
 

From b65d0dd52b680e3679450b33010bf3b54f9945a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 17:10:41 -0500
Subject: [PATCH 07/25] =?UTF-8?q?=E2=9A=A1=20speedy=20lookup=20by=20ID?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/taxon.jl | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/src/taxon.jl b/src/taxon.jl
index eb08a6a..fef410a 100644
--- a/src/taxon.jl
+++ b/src/taxon.jl
@@ -1,21 +1,16 @@
-"""
-    taxon(df::DataFrame, id::Integer)
-
-Returns a fully formed `NCBITaxon` based on its id. The `name` of the taxon
-will be the valid scientic name associated to this id.
-"""
-function taxon(df::DataFrame, id::Integer)
-    matched_index = findfirst(isequal(id), df.tax_id)
-    isnothing(matched_index) && throw(IDNotFoundInBackbone(id))
-    return NCBITaxon(df.name[matched_index], id)
-end
-
 """
     taxon(id::Integer)
 
-Performs a search in the entire taxonomy backbone based on a known ID.
+Performs a search in the entire taxonomy backbone based on a known ID. This is
+the fastest way to get to a taxon, and is used internally by the tree traversal methods.
 """
-taxon(id::Integer) = taxon(NCBITaxonomy.scinames, id)
+function taxon(id::Integer)
+    (id in NCBITaxonomy.scinames.tax_id) || throw(IDNotFoundInBackbone(id))
+    return NCBITaxon(
+        only(NCBITaxonomy.groupedscinames[id].name),
+        only(NCBITaxonomy.groupedscinames[id].tax_id)
+    )
+end
 
 function _id_from_name(name::AbstractString; kwargs...)
     return _id_from_name(NCBITaxonomy.taxonomy, name; kwargs...)

From 4933e8303095259fa6a536f4f9c2a47e51874f9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 17:17:03 -0500
Subject: [PATCH 08/25] =?UTF-8?q?=E2=9A=99=EF=B8=8F=20use=20filter=20in=20?=
 =?UTF-8?q?namefilter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/namefilters/namefilter.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl
index 56e1d42..ccd5cc7 100644
--- a/src/namefilters/namefilter.jl
+++ b/src/namefilters/namefilter.jl
@@ -23,8 +23,7 @@ end
 Returns a subset of the names table for all names under a given NCBI division.
 """
 function namefilter(division::Symbol)
-    ids = findall(isequal(division), NCBITaxonomy.nodes_table.division_code)
-    return namefilter(NCBITaxonomy.nodes_table.tax_id[ids])
+    return namefilter(filter(r -> r.division_code == division, NCBITaxonomy.taxonomy))
 end
 
 """
@@ -34,6 +33,5 @@ Returns a subset of the names table for all names under a number of multiple
 NCBI divisions.
 """
 function namefilter(division::Vector{Symbol})
-    ids = findall(x -> x in division, NCBITaxonomy.nodes_table.division_code)
-    return namefilter(NCBITaxonomy.nodes_table.tax_id[ids])
+    return namefilter(filter(r -> r.division_code in division, NCBITaxonomy.taxonomy))
 end
\ No newline at end of file

From dd430bceaf2dc9dc1554bfef561c29b8b66c52bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 17:17:17 -0500
Subject: [PATCH 09/25] =?UTF-8?q?=E2=9A=99=EF=B8=8F=20keep=20division=20co?=
 =?UTF-8?q?de?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/read_taxonomy.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/read_taxonomy.jl b/src/read_taxonomy.jl
index 4a43f6e..347132a 100644
--- a/src/read_taxonomy.jl
+++ b/src/read_taxonomy.jl
@@ -21,7 +21,7 @@ function read_taxonomy(tables_path)
 
     names_table = leftjoin(
         names_table,
-        unique(select(nodes_table, [:tax_id, :rank, :parent_tax_id]));
+        unique(select(nodes_table, [:tax_id, :rank, :parent_tax_id, :division_code]));
         on = :tax_id,
     )
 

From efd090256a69044fbd3299ce0d1f65c2561fbb15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 17:17:31 -0500
Subject: [PATCH 10/25] =?UTF-8?q?=E2=9A=A1=20direct=20lookup=20for=20rank?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/lineage/rank.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/lineage/rank.jl b/src/lineage/rank.jl
index 988ec70..a8bd880 100644
--- a/src/lineage/rank.jl
+++ b/src/lineage/rank.jl
@@ -4,6 +4,5 @@
 Returns the rank of a taxon.
 """
 function rank(tax::NCBITaxon)
-    position = findfirst(isequal(tax.id), NCBITaxonomy.nodes_table.tax_id)
-    return NCBITaxonomy.nodes_table.rank[position]
+    return only(NCBITaxonomy.groupedscinames[tax.id].rank)
 end

From b276c7bda7af37822dcf3de7a7e3ad83325881d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 17:29:10 -0500
Subject: [PATCH 11/25] =?UTF-8?q?=F0=9F=90=9B=20namefilter=20from=20df=20f?=
 =?UTF-8?q?ilter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/namefilters/namefilter.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl
index ccd5cc7..1e3a588 100644
--- a/src/namefilters/namefilter.jl
+++ b/src/namefilters/namefilter.jl
@@ -23,7 +23,7 @@ end
 Returns a subset of the names table for all names under a given NCBI division.
 """
 function namefilter(division::Symbol)
-    return namefilter(filter(r -> r.division_code == division, NCBITaxonomy.taxonomy))
+    return filter(r -> r.division_code == division, NCBITaxonomy.taxonomy)
 end
 
 """
@@ -33,5 +33,5 @@ Returns a subset of the names table for all names under a number of multiple
 NCBI divisions.
 """
 function namefilter(division::Vector{Symbol})
-    return namefilter(filter(r -> r.division_code in division, NCBITaxonomy.taxonomy))
+    return filter(r -> r.division_code in division, NCBITaxonomy.taxonomy)
 end
\ No newline at end of file

From 518faa955c06bc51b98e7f55010ecce7feb295a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 17:36:08 -0500
Subject: [PATCH 12/25] =?UTF-8?q?=F0=9F=90=9B=20index=20by=20tuple?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/lineage/rank.jl | 2 +-
 src/taxon.jl        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/lineage/rank.jl b/src/lineage/rank.jl
index a8bd880..6426301 100644
--- a/src/lineage/rank.jl
+++ b/src/lineage/rank.jl
@@ -4,5 +4,5 @@
 Returns the rank of a taxon.
 """
 function rank(tax::NCBITaxon)
-    return only(NCBITaxonomy.groupedscinames[tax.id].rank)
+    return only(NCBITaxonomy.groupedscinames[(tax_id = tax.id,)].rank)
 end
diff --git a/src/taxon.jl b/src/taxon.jl
index fef410a..310aa73 100644
--- a/src/taxon.jl
+++ b/src/taxon.jl
@@ -7,8 +7,8 @@ the fastest way to get to a taxon, and is used internally by the tree traversal
 function taxon(id::Integer)
     (id in NCBITaxonomy.scinames.tax_id) || throw(IDNotFoundInBackbone(id))
     return NCBITaxon(
-        only(NCBITaxonomy.groupedscinames[id].name),
-        only(NCBITaxonomy.groupedscinames[id].tax_id)
+        only(NCBITaxonomy.groupedscinames[(tax_id = id, )].name),
+        only(NCBITaxonomy.groupedscinames[(tax_id = id, )].tax_id)
     )
 end
 

From 276de150ba8338c303ff18726f7f3f6e587a8f90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 17:44:20 -0500
Subject: [PATCH 13/25] =?UTF-8?q?=E2=9A=A1=20update=20some=20benchmarks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 benchmark/benchmarks.jl | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index de3be02..65ed0e7 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -13,6 +13,9 @@ tax = [
 
 mf = mammalfilter()
 
+mf_t = mammalfilter(true)
+pf = primatefilter()
+
 const SUITE = BenchmarkGroup()
 
 # Construction of name finders
@@ -41,6 +44,14 @@ SUITE["taxon search"]["lowercase with finder"] =
 SUITE["taxon search"]["scientific with finder"] =
     @benchmarkable taxon(mf, "Sus scrofa"; preferscientific = true)
 
+SUITE["taxon search"]["pan - all defaults"] = @benchmarkable taxon("Pan")
+
+SUITE["taxon search"]["pan - mammal finder"] = @benchmarkable taxon(mf_t, "Pan")
+
+SUITE["taxon search"]["pan - primate finder"] = @benchmarkable taxon(pf, "Pan")
+
+SUITE["taxon search"]["pan - string annotation"] = @benchmarkable ncbi"Pan"
+
 # Ability to traverse a tree
 
 SUITE["traversal"] = BenchmarkGroup(["search", "tree traversal"])

From 1e8170602e5b835da0bc23cba81165f79f89ba5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 17:59:49 -0500
Subject: [PATCH 14/25] =?UTF-8?q?=E2=9A=A1=20update=20the=20speed=20of=20n?=
 =?UTF-8?q?amefinders?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/namefilters/namefilter.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl
index 1e3a588..568cea2 100644
--- a/src/namefilters/namefilter.jl
+++ b/src/namefilters/namefilter.jl
@@ -23,7 +23,7 @@ end
 Returns a subset of the names table for all names under a given NCBI division.
 """
 function namefilter(division::Symbol)
-    return filter(r -> r.division_code == division, NCBITaxonomy.taxonomy)
+    return groupby(NCBITaxonomy.taxonomy, :division_code)[(division_code = division, )]
 end
 
 """
@@ -33,5 +33,5 @@ Returns a subset of the names table for all names under a number of multiple
 NCBI divisions.
 """
 function namefilter(division::Vector{Symbol})
-    return filter(r -> r.division_code in division, NCBITaxonomy.taxonomy)
+    return groupby(NCBITaxonomy.taxonomy, :division_code)[[(division_code = div,) for div in division]]
 end
\ No newline at end of file

From c749b0df8e8e54eafbbed8b4141a146ca4e75d23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 18:09:55 -0500
Subject: [PATCH 15/25] =?UTF-8?q?=E2=9A=A1=20minimize=20memory=20footprint?=
 =?UTF-8?q?=20of=20the=20table?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/read_taxonomy.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/read_taxonomy.jl b/src/read_taxonomy.jl
index 347132a..bf1e9bd 100644
--- a/src/read_taxonomy.jl
+++ b/src/read_taxonomy.jl
@@ -29,6 +29,9 @@ function read_taxonomy(tables_path)
     division_table = nothing
     GC.gc()
 
+    select!(names_table, Not(:unique_name))
+    dropmissing!(names_table, [:rank, :parent_tax_id, :division_code])
+
     return names_table
 
 end
\ No newline at end of file

From 01a7ca7a5b8814c0095e5553b86cd343b6cb2135 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 18:15:32 -0500
Subject: [PATCH 16/25] =?UTF-8?q?=F0=9F=90=9B=20make=20taxon=20work=20with?=
 =?UTF-8?q?=20abstract=20dfs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/taxon.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/taxon.jl b/src/taxon.jl
index 310aa73..e3a0351 100644
--- a/src/taxon.jl
+++ b/src/taxon.jl
@@ -7,8 +7,8 @@ the fastest way to get to a taxon, and is used internally by the tree traversal
 function taxon(id::Integer)
     (id in NCBITaxonomy.scinames.tax_id) || throw(IDNotFoundInBackbone(id))
     return NCBITaxon(
-        only(NCBITaxonomy.groupedscinames[(tax_id = id, )].name),
-        only(NCBITaxonomy.groupedscinames[(tax_id = id, )].tax_id)
+        only(NCBITaxonomy.groupedscinames[(tax_id = id,)].name),
+        only(NCBITaxonomy.groupedscinames[(tax_id = id,)].tax_id),
     )
 end
 
@@ -17,7 +17,7 @@ function _id_from_name(name::AbstractString; kwargs...)
 end
 
 function _id_from_name(
-    df::DataFrame,
+    df::T,
     name::AbstractString;
     strict::Bool = true,
     dist::Type{SD} = Levenshtein,
@@ -25,7 +25,7 @@ function _id_from_name(
     rank::Union{Nothing, Symbol} = nothing,
     preferscientific::Bool = false,
     onlysynonyms::Bool = false,
-) where {SD <: StringDistance}
+) where {SD <: StringDistance, T <: AbstractDataFrame}
     if !isnothing(rank)
         @assert rank ∈ unique(df.rank)
         df = df[findall(isequal(rank), df.rank), :]
@@ -94,7 +94,7 @@ taxon(name::AbstractString; kwargs...) = taxon(NCBITaxonomy.taxonomy, name; kwar
 Additional method for `taxon` with an extra dataframe argument, used most often
 with a `namefinder`. Accepts the usual `taxon` keyword arguments.
 """
-function taxon(df::DataFrame, name::AbstractString; kwargs...)
+function taxon(df::T, name::String; kwargs...) where {T <: AbstractDataFrame}
     id = _id_from_name(df, name; kwargs...)
     isnothing(id) && return nothing
     return taxon(id)

From 3ebcc5a82e9fe4c8f1336855f2c1fe0f315ad2ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 18:16:45 -0500
Subject: [PATCH 17/25] =?UTF-8?q?=F0=9F=A6=86=20make=20sure=20name=20is=20?=
 =?UTF-8?q?a=20String?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/taxon.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/taxon.jl b/src/taxon.jl
index e3a0351..98dab01 100644
--- a/src/taxon.jl
+++ b/src/taxon.jl
@@ -18,7 +18,7 @@ end
 
 function _id_from_name(
     df::T,
-    name::AbstractString;
+    name::String;
     strict::Bool = true,
     dist::Type{SD} = Levenshtein,
     casesensitive::Bool = true,

From 46f8a52d77731342e1db67311ade6f5d6cf59da6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 18:18:33 -0500
Subject: [PATCH 18/25] =?UTF-8?q?=F0=9F=9A=A7=20allow=20missings?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/read_taxonomy.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/read_taxonomy.jl b/src/read_taxonomy.jl
index bf1e9bd..226d691 100644
--- a/src/read_taxonomy.jl
+++ b/src/read_taxonomy.jl
@@ -30,7 +30,7 @@ function read_taxonomy(tables_path)
     GC.gc()
 
     select!(names_table, Not(:unique_name))
-    dropmissing!(names_table, [:rank, :parent_tax_id, :division_code])
+    #dropmissing!(names_table, [:rank, :parent_tax_id, :division_code])
 
     return names_table
 

From 79e2dde08d4ea693a66ed860b5ae446b6abdfdba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 20:49:37 -0500
Subject: [PATCH 19/25] =?UTF-8?q?=F0=9F=90=9B=20return=20data=20frame=20fo?=
 =?UTF-8?q?r=20filtering=20on=20multiple=20divisions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/namefilters/namefilter.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl
index 568cea2..2e43ac9 100644
--- a/src/namefilters/namefilter.jl
+++ b/src/namefilters/namefilter.jl
@@ -33,5 +33,5 @@ Returns a subset of the names table for all names under a number of multiple
 NCBI divisions.
 """
 function namefilter(division::Vector{Symbol})
-    return groupby(NCBITaxonomy.taxonomy, :division_code)[[(division_code = div,) for div in division]]
+    return vcat(groupby(NCBITaxonomy.taxonomy, :division_code)[[(division_code = div,) for div in division]]...)
 end
\ No newline at end of file

From de7c44132589d2f95f353650ad203a96a7f69d17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 20:56:44 -0500
Subject: [PATCH 20/25] =?UTF-8?q?=E2=9A=A1=20authority?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/NCBITaxonomy.jl      |  1 +
 src/utility/nametools.jl | 10 ++++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl
index 160cf29..2d4bf8d 100644
--- a/src/NCBITaxonomy.jl
+++ b/src/NCBITaxonomy.jl
@@ -30,6 +30,7 @@ include("read_taxonomy.jl")
 taxonomy = read_taxonomy(tables_path)
 scinames = filter(r -> r.class == NCBITaxonomy.class_scientific_name, taxonomy)
 groupedscinames = groupby(scinames, :tax_id)
+groupedtaxonomy = groupby(taxonomy, :tax_id)
 
 include("taxon.jl")
 export taxon, @ncbi_str
diff --git a/src/utility/nametools.jl b/src/utility/nametools.jl
index 6d08f98..0e1df35 100644
--- a/src/utility/nametools.jl
+++ b/src/utility/nametools.jl
@@ -35,8 +35,10 @@ end
 This function will return `nothing` if no authority exist, and a string with the
 authority if found.
 """
-function authority(t::NCBITaxon)
-    x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :]
-    p = findall(isequal(NCBITaxonomy.class_authority), x.class)
-    return length(p) == 0 ? nothing : first(x.name[p])
+function authority(tax::NCBITaxon)
+    auth = filter(r -> r.class == NCBITaxonomy.class_authority, NCBITaxonomy.groupedtaxonomy[(tax_id = tax.id, )])
+    if isempty(auth)
+        return nothing
+    end
+    return only(auth.name)
 end

From 2653d793e437ecd9779af2adec9a998558a61c59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 21:00:45 -0500
Subject: [PATCH 21/25] =?UTF-8?q?=E2=9A=A1=20nametools?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/utility/nametools.jl | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/src/utility/nametools.jl b/src/utility/nametools.jl
index 0e1df35..b1c83e3 100644
--- a/src/utility/nametools.jl
+++ b/src/utility/nametools.jl
@@ -5,16 +5,16 @@ This function will return `nothing` if no vernacular name is known, and an array
 of names if found. It searches the "common name" and "genbank common name"
 category of the NCBI taxonomy name table.
 """
-function vernacular(t::NCBITaxon)
-    x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :]
-    p = findall(
-        !isnothing,
-        indexin(
-            x.class,
-            [NCBITaxonomy.class_common_name, NCBITaxonomy.class_genbank_common_name],
-        ),
+function vernacular(tax::NCBITaxon)
+    vern = filter(
+        r -> r.class in
+        [NCBITaxonomy.class_common_name, NCBITaxonomy.class_genbank_common_name],
+        NCBITaxonomy.groupedtaxonomy[(tax_id = tax.id,)],
     )
-    return length(p) == 0 ? nothing : x.name[p]
+    if isempty(vern)
+        return nothing
+    end
+    return vern.name
 end
 
 """
@@ -23,10 +23,15 @@ end
 This function will return `nothing` if no synonyms exist, and an array of names
 if they do. It returns all of the
 """
-function synonyms(t::NCBITaxon)
-    x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :]
-    p = findall(isequal(NCBITaxonomy.class_synonym), x.class)
-    return length(p) == 0 ? nothing : x.name[p]
+function synonyms(tax::NCBITaxon)
+    syno = filter(
+        r -> r.class == NCBITaxonomy.class_synonym,
+        NCBITaxonomy.groupedtaxonomy[(tax_id = tax.id,)],
+    )
+    if isempty(syno)
+        return nothing
+    end
+    return syno.name
 end
 
 """
@@ -36,7 +41,10 @@ This function will return `nothing` if no authority exist, and a string with the
 authority if found.
 """
 function authority(tax::NCBITaxon)
-    auth = filter(r -> r.class == NCBITaxonomy.class_authority, NCBITaxonomy.groupedtaxonomy[(tax_id = tax.id, )])
+    auth = filter(
+        r -> r.class == NCBITaxonomy.class_authority,
+        NCBITaxonomy.groupedtaxonomy[(tax_id = tax.id,)],
+    )
     if isempty(auth)
         return nothing
     end

From 9180a319c4f9a6e84934616ca7b6084d550199ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 21:05:57 -0500
Subject: [PATCH 22/25] =?UTF-8?q?=F0=9F=A6=86=20remove=20all=20union{missi?=
 =?UTF-8?q?ng}=20from=20the=20taxonomy=20table?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/read_taxonomy.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/read_taxonomy.jl b/src/read_taxonomy.jl
index 226d691..bf1e9bd 100644
--- a/src/read_taxonomy.jl
+++ b/src/read_taxonomy.jl
@@ -30,7 +30,7 @@ function read_taxonomy(tables_path)
     GC.gc()
 
     select!(names_table, Not(:unique_name))
-    #dropmissing!(names_table, [:rank, :parent_tax_id, :division_code])
+    dropmissing!(names_table, [:rank, :parent_tax_id, :division_code])
 
     return names_table
 

From 4a04f3093e629f260bd44d0d64fd2b543f92be9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 21:23:20 -0500
Subject: [PATCH 23/25] =?UTF-8?q?=E2=9A=A1=20pre-split=20the=20division=20?=
 =?UTF-8?q?table?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/NCBITaxonomy.jl           | 1 +
 src/namefilters/namefilter.jl | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl
index 2d4bf8d..ef7b91e 100644
--- a/src/NCBITaxonomy.jl
+++ b/src/NCBITaxonomy.jl
@@ -31,6 +31,7 @@ taxonomy = read_taxonomy(tables_path)
 scinames = filter(r -> r.class == NCBITaxonomy.class_scientific_name, taxonomy)
 groupedscinames = groupby(scinames, :tax_id)
 groupedtaxonomy = groupby(taxonomy, :tax_id)
+divisions = groupby(taxonomy, :division_code)
 
 include("taxon.jl")
 export taxon, @ncbi_str
diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl
index 2e43ac9..ae7f298 100644
--- a/src/namefilters/namefilter.jl
+++ b/src/namefilters/namefilter.jl
@@ -23,7 +23,7 @@ end
 Returns a subset of the names table for all names under a given NCBI division.
 """
 function namefilter(division::Symbol)
-    return groupby(NCBITaxonomy.taxonomy, :division_code)[(division_code = division, )]
+    return NCBITaxonomy.divisions[(division_code = division,)]
 end
 
 """
@@ -33,5 +33,5 @@ Returns a subset of the names table for all names under a number of multiple
 NCBI divisions.
 """
 function namefilter(division::Vector{Symbol})
-    return vcat(groupby(NCBITaxonomy.taxonomy, :division_code)[[(division_code = div,) for div in division]]...)
+    return vcat(NCBITaxonomy.divisions[[(division_code = div,) for div in division]]...)
 end
\ No newline at end of file

From a1599a732055ea0f88eb555069d70bb9b21483f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 21:31:05 -0500
Subject: [PATCH 24/25] =?UTF-8?q?=F0=9F=A6=86=20remove=20Abstract=20from?=
 =?UTF-8?q?=20strings?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/taxon.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/taxon.jl b/src/taxon.jl
index 98dab01..df015d4 100644
--- a/src/taxon.jl
+++ b/src/taxon.jl
@@ -12,7 +12,7 @@ function taxon(id::Integer)
     )
 end
 
-function _id_from_name(name::AbstractString; kwargs...)
+function _id_from_name(name::String; kwargs...)
     return _id_from_name(NCBITaxonomy.taxonomy, name; kwargs...)
 end
 
@@ -86,7 +86,7 @@ The keywords are:
   - `onlysynonyms` (def. `false`) - limits the search to synonyms, which may be
     useful in case the taxonomy is particularly outdated
 """
-taxon(name::AbstractString; kwargs...) = taxon(NCBITaxonomy.taxonomy, name; kwargs...)
+taxon(name::String; kwargs...) = taxon(NCBITaxonomy.taxonomy, name; kwargs...)
 
 """
     taxon(df::DataFrame, name::AbstractString; kwargs...)

From 3eda3e77bbcf0836523ab798cb3a7324fe6fb88d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= <timothee.poisot@umontreal.ca>
Date: Sat, 4 Mar 2023 21:55:05 -0500
Subject: [PATCH 25/25] =?UTF-8?q?=E2=9A=A1=20taxon=20by=20id=20speedup?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 benchmark/benchmarks.jl |  2 +
 src/NCBITaxonomy.jl     |  1 +
 src/taxon.jl            | 89 ++++++++++++++++++++++-------------------
 3 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index 65ed0e7..d2cef21 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -28,6 +28,8 @@ SUITE["name finders"]["mammals (inclusive)"] = @benchmarkable mammalfilter(true)
 
 SUITE["name finders"]["phage"] = @benchmarkable phagefilter()
 
+SUITE["name finders"]["descendants of Diplectanidae"] = @benchmarkable descendantsfilter(ncbi"Diplectanidae")
+
 # Ability to locate taxa
 
 SUITE["taxon search"] = BenchmarkGroup(["namefinding", "search"])
diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl
index ef7b91e..c1e9c5d 100644
--- a/src/NCBITaxonomy.jl
+++ b/src/NCBITaxonomy.jl
@@ -32,6 +32,7 @@ scinames = filter(r -> r.class == NCBITaxonomy.class_scientific_name, taxonomy)
 groupedscinames = groupby(scinames, :tax_id)
 groupedtaxonomy = groupby(taxonomy, :tax_id)
 divisions = groupby(taxonomy, :division_code)
+ranks = groupby(taxonomy, :rank)
 
 include("taxon.jl")
 export taxon, @ncbi_str
diff --git a/src/taxon.jl b/src/taxon.jl
index df015d4..0cb3775 100644
--- a/src/taxon.jl
+++ b/src/taxon.jl
@@ -5,17 +5,50 @@ Performs a search in the entire taxonomy backbone based on a known ID. This is
 the fastest way to get to a taxon, and is used internally by the tree traversal methods.
 """
 function taxon(id::Integer)
-    (id in NCBITaxonomy.scinames.tax_id) || throw(IDNotFoundInBackbone(id))
-    return NCBITaxon(
-        only(NCBITaxonomy.groupedscinames[(tax_id = id,)].name),
-        only(NCBITaxonomy.groupedscinames[(tax_id = id,)].tax_id),
-    )
+    try
+        m = only(NCBITaxonomy.groupedscinames[(tax_id = id,)])
+        return NCBITaxon(
+            m.name,
+            m.tax_id,
+        )
+    catch
+        throw(IDNotFoundInBackbone(id))
+    end
 end
 
 function _id_from_name(name::String; kwargs...)
     return _id_from_name(NCBITaxonomy.taxonomy, name; kwargs...)
 end
 
+# Return the row indices of `df` whose name matches `name` exactly, comparing
+# against `df.lowercase` instead of `df.name` when `casesensitive` is false.
+# The result is always a vector; an empty vector means "no match".
+function _strict_matches(
+    df::T,
+    name::String,
+    casesensitive::Bool,
+) where {T <: AbstractDataFrame}
+    # Never return `nothing` here: `_id_from_name` calls `length` and `isempty`
+    # on the result, and neither is defined for `nothing`.
+    casesensitive && return findall(==(name), df.name)
+    return findall(==(lowercase(name)), df.lowercase)
+end
+
+# Return, as a vector, the row index of the entry of `df` closest to `name` under
+# the string distance `dist`; the vector is empty when `findnearest` has no match.
+function _fuzzy_matches(
+    df::T,
+    name::String,
+    casesensitive::Bool,
+    dist::Type{SD},
+) where {T <: AbstractDataFrame, SD <: StringDistance}
+    query, pool = casesensitive ? (name, df.name) : (lowercase(name), df.lowercase)
+    # `findnearest` yields `(value, index)`; the index is a scalar or `nothing`, so
+    # `isempty` is meaningless on it. Wrap it so callers can use `length`/`isempty`.
+    position = last(findnearest(query, pool, dist()))
+    return isnothing(position) ? Int[] : [position]
+end
+
 function _id_from_name(
     df::T,
     name::String;
@@ -24,46 +57,18 @@ function _id_from_name(
     casesensitive::Bool = true,
     rank::Union{Nothing, Symbol} = nothing,
     preferscientific::Bool = false,
-    onlysynonyms::Bool = false,
 ) where {SD <: StringDistance, T <: AbstractDataFrame}
-    if !isnothing(rank)
-        @assert rank ∈ unique(df.rank)
-        df = df[findall(isequal(rank), df.rank), :]
-    end
-    if onlysynonyms
-        df = df[findall(isequal(NCBITaxonomy.class_synonym), df.class), :]
-    end
-    if strict
-        positions = if casesensitive
-            findall(isequal(name), df.name)
-        else
-            findall(isequal(lowercase(name)), df.lowercase)
-        end
-        # If the array is empty, we throw the "no name" error
-        isempty(positions) && throw(NameHasNoDirectMatch(name))
-        # If the array has a single element, this is the ticket
-        length(positions) == 1 && return df.tax_id[first(positions)]
-        # If we prefer scientific names, we can filter with this
-        if preferscientific
-            if NCBITaxonomy.class_scientific_name in df.class[positions]
-                ids = df.tax_id[positions][findall(
-                    isequal(NCBITaxonomy.class_scientific_name),
-                    df.class[positions],
-                )]
-                if length(ids) == 1
-                    return first(ids)
-                else
-                    throw(NameHasMultipleMatches(name, taxon.(ids)))
-                end
-            end
-        end
-        # If neither of these are satisfied, the name has multiple matches and we throw the appropriate error
-        taxa = taxon.(df.tax_id[positions])
-        throw(NameHasMultipleMatches(name, taxa))
+    # Perform the correct search
+    positions = if strict
+        _strict_matches(df, name, casesensitive)
     else
-        correct_name, position = findnearest(name, df.name, dist())
-        return df.tax_id[position]
+        _fuzzy_matches(df, name, casesensitive, dist)
     end
+    (isnothing(positions) || isempty(positions)) && throw(NameHasNoDirectMatch(name))
+    length(positions) == 1 && return df.tax_id[only(positions)]
+    # Several rows match: raise the ambiguity to the caller instead of logging the
+    # candidates and implicitly returning `nothing` (the value of `@info`).
+    throw(NameHasMultipleMatches(name, taxon.(df.tax_id[positions])))
 end
 
 """