PoisotLab · tpoisot · Mar 4, 2023 · Mar 4, 2023 · Mar 4, 2023 · Mar 4, 2023
diff --git a/Project.toml b/Project.toml
@@ -7,8 +7,10 @@ version = "0.4.1"
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
 MD5 = "6ac74813-4b46-53a4-afec-0b5dc9d7885c"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
 StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
 Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
 

diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
@@ -28,6 +28,8 @@ SUITE["name finders"]["mammals (inclusive)"] = @benchmarkable mammalfilter(true)
 
 SUITE["name finders"]["phage"] = @benchmarkable phagefilter()
 
+SUITE["name finders"]["descendants of Diplectanidae"] = @benchmarkable descendantsfilter(ncbi"Diplectanidae")
+
 # Ability to locate taxa
 
 SUITE["taxon search"] = BenchmarkGroup(["namefinding", "search"])

diff --git a/deps/build.jl b/deps/build.jl
@@ -3,71 +3,25 @@ import GZip
 import Tar
 import Arrow
 import DataFrames
+import Downloads
 
-# URL for the taxonomy dump
-const ncbi_ftp = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/"
-const archive = ncbi_ftp * "new_taxdump.tar.gz"
-const checksum = archive * ".md5"
-
-if !haskey(ENV, "NCBITAXONOMY_PATH")
-    @warn """
-    The environmental variable NCBITAXONOMY_PATH is not set, so the tables will
-    be stored in your home directory. This is not ideal, and you really should set
-    the NCBITAXONOMY_PATH.
-
-    This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup
-    file. The path will be created automatically if it does not exist.
-    """
-end
-const taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy"))
-ispath(taxpath) || mkpath(taxpath)
-
-chk_file = download(checksum)
-chk = split(readlines(chk_file)[1], " ")[1]
-@info "Checksum of the most recent NCBI taxonomy: $(chk)"
-
-function download_dump(url, chk, dest)
-    @info "Downloading the taxonomy data from $(url)"
-    if ispath(joinpath(taxpath, dest))
-        @info "Removing the previous version of the taxonomy"
-        rm(joinpath(taxpath, dest); force=true, recursive=true)
-        mkpath(joinpath(taxpath, dest))
-    else
-        mkpath(joinpath(taxpath, dest))
-    end
-    arc = download(url)
-    vrf = bytes2hex(open(MD5.md5, arc))
-    vrf == chk || throw(ErrorException("Wrong checksum for the NCBI taxonomy archive file - unable to download"))
-    write(joinpath(taxpath, ".checksum"), vrf)
-    Tar.extract(GZip.open(arc), joinpath(taxpath, dest))
-end
-
-# The next block is about making sure that we don't download something that has
-# not changed when we build the package. The taxonomy dump is not gigantic, but
-# there is no need to get it over and over again.
-if !isfile(joinpath(taxpath, ".checksum"))
-    @info "No local taxonomy checksum found"
-    download_dump(archive, chk, "dump")
-else
-    local_chk = readline(joinpath(taxpath, ".checksum"))
-    if local_chk != chk
-        @info "Local and remote checksum do not match"
-        download_dump(archive, chk, "dump")
-    else
-        @info "Local taxonomy dump ($(local_chk)) is up to date"
-    end
-end
+# There are two things we need for the build process: the types, and the
+# location of the files
+include(joinpath(@__DIR__, "hydrate.jl"))
+include(joinpath(@__DIR__, "..", "src", "types.jl"))
+include(joinpath(@__DIR__, "..", "src", "local_archive_path.jl"))
 
-@info "Materializing the taxonomy"
+# These steps download and unpack the taxonomy only when needed, which is to say
+# as infrequently as possible
+remote_info = _remote_archive_path()
+local_path = _local_archive_path()
+remote_checksum = _get_current_remote_checksum(local_path, remote_info)
+local_archive = _unpack_if_needed(local_path, remote_info, remote_checksum)
 
 # We will store the tables used by the package in the tables folder
-tables = joinpath(taxpath, "tables")
-ispath(tables) || mkpath(tables)
+tables_path = _create_or_get_tables_path(local_path)
 
 # Utility functions
-
-include(joinpath(@__DIR__, "..", "src", "types.jl"))
-
 function _class_to_enum(c::T) where {T <: String}
     c = replace(c, " " => "_")
     c = replace(c, "-" => "_")
@@ -84,8 +38,8 @@ issue.
 function _materialize_data(::Type{T}, v) where {T}
     if v != ""
         T <: Number && return parse(T, v)
-        T <: Union{Bool,Missing} && return parse(Bool, v)
-        T <: Union{Int,Missing} && return parse(Int, v)
+        T <: Union{Bool, Missing} && return parse(Bool, v)
+        T <: Union{Int, Missing} && return parse(Int, v)
         T <: Symbol && return Symbol(v)
         T <: NCBINameClass && return _class_to_enum(v)
         return v
@@ -107,44 +61,53 @@ function _build_arrow_file(df, dump_file)
     return df
 end
 
-# Get the data
-
-@info "Building the names file"
-ncbi_names_file_in = joinpath(taxpath, "dump", "names.dmp")
-ncbi_names_file_out = joinpath(taxpath, "tables", "names.arrow")
-ncbi_names = DataFrames.DataFrame(tax_id=Int[], name=String[], unique_name=Union{String,Missing}[], class=NCBINameClass[])
+# Get the data for the names
+ncbi_names_file_in = joinpath(local_path, "dump", "names.dmp")
+ncbi_names_file_out = joinpath(tables_path, "names.arrow")
+ncbi_names = DataFrames.DataFrame(;
+    tax_id = Int[],
+    name = String[],
+    unique_name = Union{String, Missing}[],
+    class = NCBINameClass[],
+)
 names_df = _build_arrow_file(ncbi_names, ncbi_names_file_in)
 names_df.class = Int.(names_df.class)
 Arrow.write(ncbi_names_file_out, names_df)
 names_df = nothing
 GC.gc()
 
-@info "Building the division file"
-ncbi_division_file_in = joinpath(taxpath, "dump", "division.dmp")
-ncbi_division_file_out = joinpath(taxpath, "tables", "division.arrow")
-ncbi_division = DataFrames.DataFrame(division_id=Int[], division_code=Symbol[], division_name=Symbol[], comments=Union{String,Missing}[])
+ncbi_division_file_in = joinpath(local_path, "dump", "division.dmp")
+ncbi_division_file_out = joinpath(tables_path, "division.arrow")
+ncbi_division = DataFrames.DataFrame(;
+    division_id = Int[],
+    division_code = Symbol[],
+    division_name = Symbol[],
+    comments = Union{String, Missing}[],
+)
 division_df = _build_arrow_file(ncbi_division, ncbi_division_file_in)
 Arrow.write(ncbi_division_file_out, division_df)
 division_df = nothing
 GC.gc()
 
-@info "Building the nodes file"
-ncbi_nodes_file_in = joinpath(taxpath, "dump", "nodes.dmp")
-ncbi_nodes_file_out = joinpath(taxpath, "tables", "nodes.arrow")
-ncbi_nodes = DataFrames.DataFrame(
-    tax_id=Int[], parent_tax_id=Int[],
-    rank=Symbol[],
-    embl=Union{String,Missing}[],
-    division_id=Int[], inherited_div=Union{Bool,Missing}[],
-    genetic_code_id=Int[], inherited_gc=Union{Bool,Missing}[], 
-    mitochondrial_genetic_code_id=Union{Int,Missing}[], inherited_mgc=Union{Bool,Missing}[],
-    genbank_hidden=Union{Bool,Missing}[],
-    hidden_subtree=Union{Bool,Missing}[],
-    comments=Union{String,Missing}[],
-    plastid_genetic_code_id=Union{Int,Missing}[], inherited_pgc=Union{Bool,Missing}[],
-    specified_species=Union{Bool,Missing}[],
-    hydrogenosome_code_id=Union{Int,Missing}[], inherited_hgc=Union{Bool,Missing}[]
-    )
+ncbi_nodes_file_in = joinpath(local_path, "dump", "nodes.dmp")
+ncbi_nodes_file_out = joinpath(tables_path, "nodes.arrow")
+ncbi_nodes = DataFrames.DataFrame(;
+    tax_id = Int[], parent_tax_id = Int[],
+    rank = Symbol[],
+    embl = Union{String, Missing}[],
+    division_id = Int[], inherited_div = Union{Bool, Missing}[],
+    genetic_code_id = Int[], inherited_gc = Union{Bool, Missing}[],
+    mitochondrial_genetic_code_id = Union{Int, Missing}[],
+    inherited_mgc = Union{Bool, Missing}[],
+    genbank_hidden = Union{Bool, Missing}[],
+    hidden_subtree = Union{Bool, Missing}[],
+    comments = Union{String, Missing}[],
+    plastid_genetic_code_id = Union{Int, Missing}[],
+    inherited_pgc = Union{Bool, Missing}[],
+    specified_species = Union{Bool, Missing}[],
+    hydrogenosome_code_id = Union{Int, Missing}[],
+    inherited_hgc = Union{Bool, Missing}[],
+)
 nodes_df = _build_arrow_file(ncbi_nodes, ncbi_nodes_file_in)
 Arrow.write(ncbi_nodes_file_out, nodes_df)
 nodes_df = nothing

diff --git a/deps/hydrate.jl b/deps/hydrate.jl
@@ -0,0 +1,108 @@
+"""
+    _remote_archive_path(; ncbi_ftp)
+
+Return the remote location of the taxonomy dump as a named tuple with three fields:
+`url` (the base URL, given by `ncbi_ftp`), `archive` (the name of the archive file),
+and `checksum` (the name of the file with the checksum of the archive).
+"""
+function _remote_archive_path(;
+    ncbi_ftp = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/",
+)::NamedTuple{
+    (:url, :archive, :checksum),
+    Tuple{String, String, String},
+}
+    return (
+        url = ncbi_ftp,
+        archive = "new_taxdump.tar.gz",
+        checksum = "new_taxdump.tar.gz.md5",
+    )
+end
+
+"""
+    _get_current_remote_checksum(local_path, remote_info)
+
+Download the remote checksum file to `.checksum.remote` in `local_path`, and return the
+first space-delimited token of its first line.
+
+Throws an `ErrorException` when that token is empty.
+"""
+function _get_current_remote_checksum(local_path, remote_info)
+    chk_file = Downloads.download(
+        remote_info.url * remote_info.checksum,
+        joinpath(local_path, ".checksum.remote"),
+    )
+    # `readline` gives "" for an empty file, where indexing `readlines` would throw
+    remote_checksum = first(split(readline(chk_file), " "))
+    if isempty(remote_checksum)
+        error("The remote checksum file is empty - unable to check the taxonomy dump")
+    end
+    return remote_checksum
+end
+
+"""
+    _download_archive(local_path, remote_info)
+
+Download the remote archive into `local_path`, and return the path to the local copy.
+"""
+function _download_archive(local_path, remote_info)
+    destination = joinpath(local_path, remote_info.archive)
+    Downloads.download(remote_info.url * remote_info.archive, destination)
+    return destination
+end
+
+"""
+    _unpack_if_needed(local_path, remote_info, remote_checksum)
+
+Download the archive into `local_path` when it is missing or when its MD5 checksum is
+not `remote_checksum`, unpack it to the `dump` folder after a download or when that
+folder does not exist, and return the path to the `dump` folder.
+
+Throws an `ErrorException` when the archive still does not match `remote_checksum`
+after being downloaded again.
+"""
+function _unpack_if_needed(local_path, remote_info, remote_checksum)
+    local_archive = joinpath(local_path, remote_info.archive)
+    dump_path = joinpath(local_path, "dump")
+    need_update = false
+    if !isfile(local_archive)
+        @warn "There is no local taxonomy dump, we will download one"
+        local_archive = _download_archive(local_path, remote_info)
+        need_update = true
+    end
+    local_checksum = bytes2hex(open(MD5.md5, local_archive))
+    if local_checksum != remote_checksum
+        @warn "The checksum of the taxonomy dump does not match the remote"
+        local_archive = _download_archive(local_path, remote_info)
+        local_checksum = bytes2hex(open(MD5.md5, local_archive))
+        # Never unpack an archive that cannot be verified against the remote checksum
+        if local_checksum != remote_checksum
+            error("Wrong checksum for the NCBI taxonomy archive file - unable to download")
+        end
+        need_update = true
+    end
+    # A missing dump folder must be unpacked even when the archive itself is current
+    if need_update || !isdir(dump_path)
+        @warn "We are unpacking the local taxonomy dump"
+        # The dump is a folder, and `rm` only removes a non-empty folder recursively
+        rm(dump_path; force = true, recursive = true)
+        stream = GZip.open(local_archive)
+        try
+            Tar.extract(stream, dump_path)
+        finally
+            close(stream)
+        end
+    end
+    return dump_path
+end
+
+"""
+    _create_or_get_tables_path(local_path)
+
+Return the path to the `tables` folder in `local_path`, creating the folder when nothing
+exists at that path.
+"""
+function _create_or_get_tables_path(local_path)
+    tables_path = joinpath(local_path, "tables")
+    ispath(tables_path) || mkpath(tables_path)
+    return tables_path
+end
diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl
@@ -1,59 +1,38 @@
 module NCBITaxonomy
 using DataFrames
-using Arrow
+import Arrow
 using StringDistances
 using AbstractTrees
 
-if !haskey(ENV, "NCBITAXONOMY_PATH")
-    @warn """
-    The environmental variable NCBITAXONOMY_PATH is not set, so the tables will
-    be stored in the package path. This is not ideal, and you really should set
-    the NCBITAXONOMY_PATH.
-
-    This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup
-    file. The path will be created automatically if it does not exist.
-    """
-end
-const taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy"))
-ispath(taxpath) || mkpath(taxpath)
+# Point to where the taxonomy is located
+include("local_archive_path.jl")
+tables_path = _create_or_get_tables_path(_local_archive_path())
 
 function __init__()
-    name_date = mtime(joinpath(taxpath, "tables", "names.arrow"))
-    return time() - name_date >= 2.6e+6 && @warn(
-        "Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version."
-    )
+    # `names.arrow` is written by the build step: its age dates the local taxonomy
+    seconds_since_build = time() - mtime(joinpath(tables_path, "names.arrow"))
+    if seconds_since_build >= 2.6e+6
+        @warn(
+            "Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version."
+        )
+    end
+    return nothing
 end
 
 include("types.jl")
-export NCBITaxon, NCBINameClass, IDNotFoundInBackbone
+export NCBITaxon, NCBINameClass
 
 include("exceptions.jl")
-export NameHasNoDirectMatch, NameHasMultipleMatches
-
-names_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "names.arrow")))
-names_table.class = NCBINameClass.(names_table.class)
-names_table.lowercase = lowercase.(names_table.name)
-
-division_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "division.arrow")))
-select!(division_table, Not(:comments))
-
-nodes_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "nodes.arrow")))
-select!(nodes_table, Not(r"inherited_"))
-select!(nodes_table, Not(r"_code_id"))
-select!(nodes_table, Not(:genbank_hidden))
-select!(nodes_table, Not(:hidden_subtree))
-select!(nodes_table, Not(:comments))
-select!(nodes_table, Not(:embl))
-
-nodes_table = innerjoin(nodes_table, division_table; on = :division_id)
-select!(nodes_table, Not(:division_id))
-
-names_table = leftjoin(
-    names_table,
-    unique(select(nodes_table, [:tax_id, :rank, :parent_tax_id]));
-    on = :tax_id,
-)
-scinames_table = names_table[findall(names_table.class .== class_scientific_name), :]
+export NameHasNoDirectMatch, NameHasMultipleMatches, IDNotFoundInBackbone
+
+# Load the core taxonomy table once; `const` fixes the type of these module globals
+include("read_taxonomy.jl")
+const taxonomy = read_taxonomy(tables_path)
+const scinames = filter(r -> r.class == NCBITaxonomy.class_scientific_name, taxonomy)
+const groupedscinames = groupby(scinames, :tax_id)
+const groupedtaxonomy = groupby(taxonomy, :tax_id)
+const divisions = groupby(taxonomy, :division_code)
+const ranks = groupby(taxonomy, :rank)
 
 include("taxon.jl")
 export taxon, @ncbi_str

diff --git a/src/interfaces/abstracttrees.jl b/src/interfaces/abstracttrees.jl
@@ -4,13 +4,13 @@
 Returns the children of a taxon.
 """
 function AbstractTrees.children(tax::NCBITaxon)
-    positions = findall(isequal(tax.id), NCBITaxonomy.scinames_table.parent_tax_id)
+    positions = findall(isequal(tax.id), NCBITaxonomy.scinames.parent_tax_id)
     if ~isempty(positions)
         list_of_children = Vector{NCBITaxon}(undef, length(positions))
         for i in axes(positions, 1)
             list_of_children[i] = NCBITaxon(
-                NCBITaxonomy.scinames_table.name[positions[i]],
-                NCBITaxonomy.scinames_table.tax_id[positions[i]],
+                NCBITaxonomy.scinames.name[positions[i]],
+                NCBITaxonomy.scinames.tax_id[positions[i]],
             )
         end
         return list_of_children
@@ -25,15 +25,15 @@ end
 Returns the taxon from which the argument taxon is descended.
 """
 function AbstractTrees.parent(tax::NCBITaxon)
-    position = findfirst(isequal(tax.id), NCBITaxonomy.scinames_table.tax_id)
+    position = findfirst(isequal(tax.id), NCBITaxonomy.scinames.tax_id)
     if ~isnothing(position)
         parent_position = findfirst(
-            isequal(NCBITaxonomy.scinames_table.parent_tax_id[position]),
-            NCBITaxonomy.scinames_table.tax_id,
+            isequal(NCBITaxonomy.scinames.parent_tax_id[position]),
+            NCBITaxonomy.scinames.tax_id,
         )
         return NCBITaxon(
-            NCBITaxonomy.scinames_table.name[parent_position],
-            NCBITaxonomy.scinames_table.tax_id[parent_position],
+            NCBITaxonomy.scinames.name[parent_position],
+            NCBITaxonomy.scinames.tax_id[parent_position],
         )
     else
         return nothing

diff --git a/src/lineage/rank.jl b/src/lineage/rank.jl
@@ -4,6 +4,7 @@
 Returns the rank of a taxon.
 """
 function rank(tax::NCBITaxon)
-    position = findfirst(isequal(tax.id), NCBITaxonomy.nodes_table.tax_id)
-    return NCBITaxonomy.nodes_table.rank[position]
+    # `only` throws unless the taxon has exactly one row among the scientific names
+    taxon_rows = NCBITaxonomy.groupedscinames[(tax_id = tax.id,)]
+    return only(taxon_rows.rank)
 end