From 57d3c10297c5a05e9cf13941275dad695cddccde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 15:59:07 -0500 Subject: [PATCH 01/25] =?UTF-8?q?=F0=9F=9B=91=20sanitize=20the=20download/?= =?UTF-8?q?extract=20step?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deps/build.jl | 64 ++++++------------------------------- src/NCBITaxonomy.jl | 25 ++++++--------- src/hydrate.jl | 78 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 71 deletions(-) create mode 100644 src/hydrate.jl diff --git a/deps/build.jl b/deps/build.jl index b914cde..2da9dcc 100644 --- a/deps/build.jl +++ b/deps/build.jl @@ -3,66 +3,20 @@ import GZip import Tar import Arrow import DataFrames +import Downloads -# URL for the taxonomy dump -const ncbi_ftp = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/" -const archive = ncbi_ftp * "new_taxdump.tar.gz" -const checksum = archive * ".md5" - -if !haskey(ENV, "NCBITAXONOMY_PATH") - @warn """ - The environmental variable NCBITAXONOMY_PATH is not set, so the tables will - be stored in your home directory. This is not ideal, and you really should set - the NCBITAXONOMY_PATH. - - This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup - file. The path will be created automatically if it does not exist. - """ -end -const taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy")) -ispath(taxpath) || mkpath(taxpath) - -chk_file = download(checksum) -chk = split(readlines(chk_file)[1], " ")[1] -@info "Checksum of the most recent NCBI taxonomy: $(chk)" - -function download_dump(url, chk, dest) - @info "Downloading the taxonomy data from $(url)" - if ispath(joinpath(taxpath, dest)) - @info "Removing the previous version of the taxonomy" - rm(joinpath(taxpath, dest); force=true, recursive=true) - mkpath(joinpath(taxpath, dest)) - else - mkpath(joinpath(taxpath, dest)) - end - arc = download(url) - vrf = bytes2hex(open(MD5.md5, arc)) - vrf == chk || throw(ErrorException("Wrong checksum for the NCBI taxonomy archive file - unable to download")) - write(joinpath(taxpath, ".checksum"), vrf) - Tar.extract(GZip.open(arc), joinpath(taxpath, dest)) -end - -# The next block is about making sure that we don't download something that has -# not changed when we build the package. The taxonomy dump is not gigantic, but -# there is no need to get it over and over again. -if !isfile(joinpath(taxpath, ".checksum")) - @info "No local taxonomy checksum found" - download_dump(archive, chk, "dump") -else - local_chk = readline(joinpath(taxpath, ".checksum")) - if local_chk != chk - @info "Local and remote checksum do not match" - download_dump(archive, chk, "dump") - else - @info "Local taxonomy dump ($(local_chk)) is up to date" - end -end +# These steps are meant to download and unpack the taxonomy as needed, which is +# to say as unfrequently as possible +include(joinpath(@__DIR__, "..", "src", "hydrate.jl")) +remote_info = _remote_archive_path() +local_path = _local_archive_path() +remote_checksum = _get_current_remote_checksum(local_path, remote_info) +local_archive = _unpack_if_needed(local_path, remote_info, remote_checksum) @info "Materializing the taxonomy" # We will store the tables used by the package in the tables folder -tables = joinpath(taxpath, "tables") -ispath(tables) || mkpath(tables) +tables_path = _create_or_get_tables_path(local_path) # Utility functions diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl index 5d01411..fc27fbe 100644 --- a/src/NCBITaxonomy.jl +++ b/src/NCBITaxonomy.jl @@ -4,24 +4,17 @@ using Arrow using StringDistances using AbstractTrees -if !haskey(ENV, "NCBITAXONOMY_PATH") - @warn """ - The environmental variable NCBITAXONOMY_PATH is not set, so the tables will - be stored in the package path. This is not ideal, and you really should set - the NCBITAXONOMY_PATH. - - This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup - file. The path will be created automatically if it does not exist. - """ -end -const taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy")) -ispath(taxpath) || mkpath(taxpath) +# Point to where the taxonomy is located +include("hydrate.jl") +local_path = _local_archive_path() function __init__() - name_date = mtime(joinpath(taxpath, "tables", "names.arrow")) - return time() - name_date >= 2.6e+6 && @warn( - "Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version." - ) + name_date = mtime(joinpath(local_path, "tables", "names.arrow")) + over_30_days = time() - name_date >= 2.6e+6 + if over_30_days + @warn("Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version.") + end + return nothing end include("types.jl") diff --git a/src/hydrate.jl b/src/hydrate.jl new file mode 100644 index 0000000..0cf76b6 --- /dev/null +++ b/src/hydrate.jl @@ -0,0 +1,78 @@ +function _remote_archive_path(; + ncbi_ftp = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/", +)::NamedTuple{ + (:url, :archive, :checksum), + Tuple{String, String, String}, +} + return ( + url = ncbi_ftp, + archive = "new_taxdump.tar.gz", + checksum = "new_taxdump.tar.gz.md5", + ) +end + +""" + _local_archive_path() + +Returns the path where the taxonomy dump is stored, and throws a warning if the +path is not set as an environmental variable. This is used during the build step +*and* during the initial startup of the package. +""" +function _local_archive_path()::String + if !haskey(ENV, "NCBITAXONOMY_PATH") + @warn """ + The environmental variable NCBITAXONOMY_PATH is not set, so the tables will + be stored in your home directory. This is not ideal, and you really should set + the NCBITAXONOMY_PATH. + + This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup + file. The path will be created automatically if it does not exist. + """ + end + taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy")) + ispath(taxpath) || mkpath(taxpath) + return taxpath +end + +function _get_current_remote_checksum(local_path, remote_info) + chk_file = Downloads.download( + remote_info.url * remote_info.checksum, + joinpath(local_path, ".checksum.remote"), + ) + return split(readlines(chk_file)[1], " ")[1] +end + +function _download_archive(local_path, remote_info) + Downloads.download( + remote_info.url * remote_info.archive, + joinpath(local_path, remote_info.archive), + ) + return joinpath(local_path, remote_info.archive) +end + +function _unpack_if_needed(local_path, remote_info, remote_checksum) + local_archive = joinpath(local_path, remote_info.archive) + need_update = false + if ~isfile(local_archive) + local_archive = _download_archive(local_path, remote_info) + local_checksum = bytes2hex(open(MD5.md5, local_archive)) + need_update = true + else + local_checksum = bytes2hex(open(MD5.md5, local_archive)) + end + if local_checksum != remote_checksum + local_archive = _download_archive(local_path, remote_info) + local_checksum = bytes2hex(open(MD5.md5, local_archive)) + need_update = true + end + if need_update + Tar.extract(GZip.open(local_archive), joinpath(local_path, "dump")) + end + return joinpath(local_path, "dump") +end + +function _create_or_get_tables_path(local_path) + tables_path = joinpath(local_path, "tables") + ispath(tables_path) || mkpath(tables_path) + return tables_path +end \ No newline at end of file From c854acaaa43aef84ae9cca623618b8626b0a3e55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 16:19:43 -0500 Subject: [PATCH 02/25] =?UTF-8?q?=F0=9F=9A=AE=20only=20check=20the=20local?= =?UTF-8?q?=20checksum=20after=20making=20sure=20the=20archive=20is=20here?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deps/build.jl | 78 ++++++++++++++++++++++++++++---------------------- src/hydrate.jl | 5 ++-- 2 files changed, 45 insertions(+), 38 deletions(-) diff --git a/deps/build.jl b/deps/build.jl index 2da9dcc..eb580b3 100644 --- a/deps/build.jl +++ b/deps/build.jl @@ -5,23 +5,22 @@ import Arrow import DataFrames import Downloads +# There are two things we need for the build process: the types, and the +# location of the files +include(joinpath(@__DIR__, "..", "src", "hydrate.jl")) +include(joinpath(@__DIR__, "..", "src", "types.jl")) + # These steps are meant to download and unpack the taxonomy as needed, which is # to say as unfrequently as possible -include(joinpath(@__DIR__, "..", "src", "hydrate.jl")) remote_info = _remote_archive_path() local_path = _local_archive_path() remote_checksum = _get_current_remote_checksum(local_path, remote_info) local_archive = _unpack_if_needed(local_path, remote_info, remote_checksum) -@info "Materializing the taxonomy" - # We will store the tables used by the package in the tables folder tables_path = _create_or_get_tables_path(local_path) # Utility functions - -include(joinpath(@__DIR__, "..", "src", "types.jl")) - function _class_to_enum(c::T) where {T <: String} c = replace(c, " " => "_") c = replace(c, "-" => "_") @@ -38,8 +37,8 @@ issue. function _materialize_data(::Type{T}, v) where {T} if v != "" T <: Number && return parse(T, v) - T <: Union{Bool,Missing} && return parse(Bool, v) - T <: Union{Int,Missing} && return parse(Int, v) + T <: Union{Bool, Missing} && return parse(Bool, v) + T <: Union{Int, Missing} && return parse(Int, v) T <: Symbol && return Symbol(v) T <: NCBINameClass && return _class_to_enum(v) return v @@ -61,44 +60,53 @@ function _build_arrow_file(df, dump_file) return df end -# Get the data - -@info "Building the names file" -ncbi_names_file_in = joinpath(taxpath, "dump", "names.dmp") -ncbi_names_file_out = joinpath(taxpath, "tables", "names.arrow") -ncbi_names = DataFrames.DataFrame(tax_id=Int[], name=String[], unique_name=Union{String,Missing}[], class=NCBINameClass[]) +# Get the data for the names +ncbi_names_file_in = joinpath(local_path, "dump", "names.dmp") +ncbi_names_file_out = joinpath(tables_path, "names.arrow") +ncbi_names = DataFrames.DataFrame(; + tax_id = Int[], + name = String[], + unique_name = Union{String, Missing}[], + class = NCBINameClass[], +) names_df = _build_arrow_file(ncbi_names, ncbi_names_file_in) names_df.class = Int.(names_df.class) Arrow.write(ncbi_names_file_out, names_df) names_df = nothing GC.gc() -@info "Building the division file" -ncbi_division_file_in = joinpath(taxpath, "dump", "division.dmp") -ncbi_division_file_out = joinpath(taxpath, "tables", "division.arrow") -ncbi_division = DataFrames.DataFrame(division_id=Int[], division_code=Symbol[], division_name=Symbol[], comments=Union{String,Missing}[]) +ncbi_division_file_in = joinpath(local_path, "dump", "division.dmp") +ncbi_division_file_out = joinpath(tables_path, "division.arrow") +ncbi_division = DataFrames.DataFrame(; + division_id = Int[], + division_code = Symbol[], + division_name = Symbol[], + comments = Union{String, Missing}[], +) division_df = _build_arrow_file(ncbi_division, ncbi_division_file_in) Arrow.write(ncbi_division_file_out, division_df) division_df = nothing GC.gc() -@info "Building the nodes file" -ncbi_nodes_file_in = joinpath(taxpath, "dump", "nodes.dmp") -ncbi_nodes_file_out = joinpath(taxpath, "tables", "nodes.arrow") -ncbi_nodes = DataFrames.DataFrame( - tax_id=Int[], parent_tax_id=Int[], - rank=Symbol[], - embl=Union{String,Missing}[], - division_id=Int[], inherited_div=Union{Bool,Missing}[], - genetic_code_id=Int[], inherited_gc=Union{Bool,Missing}[], - mitochondrial_genetic_code_id=Union{Int,Missing}[], inherited_mgc=Union{Bool,Missing}[], - genbank_hidden=Union{Bool,Missing}[], - hidden_subtree=Union{Bool,Missing}[], - comments=Union{String,Missing}[], - plastid_genetic_code_id=Union{Int,Missing}[], inherited_pgc=Union{Bool,Missing}[], - specified_species=Union{Bool,Missing}[], - hydrogenosome_code_id=Union{Int,Missing}[], inherited_hgc=Union{Bool,Missing}[] - ) +ncbi_nodes_file_in = joinpath(local_path, "dump", "nodes.dmp") +ncbi_nodes_file_out = joinpath(tables_path, "nodes.arrow") +ncbi_nodes = DataFrames.DataFrame(; + tax_id = Int[], parent_tax_id = Int[], + rank = Symbol[], + embl = Union{String, Missing}[], + division_id = Int[], inherited_div = Union{Bool, Missing}[], + genetic_code_id = Int[], inherited_gc = Union{Bool, Missing}[], + mitochondrial_genetic_code_id = Union{Int, Missing}[], + inherited_mgc = Union{Bool, Missing}[], + genbank_hidden = Union{Bool, Missing}[], + hidden_subtree = Union{Bool, Missing}[], + comments = Union{String, Missing}[], + plastid_genetic_code_id = Union{Int, Missing}[], + inherited_pgc = Union{Bool, Missing}[], + specified_species = Union{Bool, Missing}[], + hydrogenosome_code_id = Union{Int, Missing}[], + inherited_hgc = Union{Bool, Missing}[], +) nodes_df = _build_arrow_file(ncbi_nodes, ncbi_nodes_file_in) Arrow.write(ncbi_nodes_file_out, nodes_df) nodes_df = nothing diff --git a/src/hydrate.jl b/src/hydrate.jl index 0cf76b6..b6c3011 100644 --- a/src/hydrate.jl +++ b/src/hydrate.jl @@ -55,17 +55,16 @@ function _unpack_if_needed(local_path, remote_info, remote_checksum) need_update = false if ~isfile(local_archive) local_archive = _download_archive(local_path, remote_info) - local_checksum = bytes2hex(open(MD5.md5, local_archive)) need_update = true - else - local_checksum = bytes2hex(open(MD5.md5, local_archive)) end + local_checksum = bytes2hex(open(MD5.md5, local_archive)) if local_checksum != remote_checksum local_archive = _download_archive(local_path, remote_info) local_checksum = bytes2hex(open(MD5.md5, local_archive)) need_update = true end if need_update + rm(joinpath(local_path, "dump"); force=true) Tar.extract(GZip.open(local_archive), joinpath(local_path, "dump")) end return joinpath(local_path, "dump") From eae76182675c8e002b683c5be53436d1393d7fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 16:24:01 -0500 Subject: [PATCH 03/25] =?UTF-8?q?=E2=9C=A8=20build=20step=20cleaned=20up?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/hydrate.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/hydrate.jl b/src/hydrate.jl index b6c3011..a3902df 100644 --- a/src/hydrate.jl +++ b/src/hydrate.jl @@ -54,16 +54,19 @@ function _unpack_if_needed(local_path, remote_info, remote_checksum) local_archive = joinpath(local_path, remote_info.archive) need_update = false if ~isfile(local_archive) + @warn "There is no local taxonomy dump, we will download one" local_archive = _download_archive(local_path, remote_info) need_update = true end local_checksum = bytes2hex(open(MD5.md5, local_archive)) if local_checksum != remote_checksum + @warn "The checksum of the taxonomy dump does not match the remote" local_archive = _download_archive(local_path, remote_info) local_checksum = bytes2hex(open(MD5.md5, local_archive)) need_update = true end if need_update + @warn "We are unpacking the local taxonomy dump" rm(joinpath(local_path, "dump"); force=true) Tar.extract(GZip.open(local_archive), joinpath(local_path, "dump")) end From 16838013d4fccdd0dd96d0877780eccd10eba39d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 16:43:24 -0500 Subject: [PATCH 04/25] =?UTF-8?q?=E2=9A=99=EF=B8=8F=20move=20the=20code=20?= =?UTF-8?q?to=20read=20the=20taxonomy=20out=20of=20the=20main=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deps/build.jl | 3 ++- {src => deps}/hydrate.jl | 23 ----------------------- src/NCBITaxonomy.jl | 35 +++++++---------------------------- src/local_archive_path.jl | 28 ++++++++++++++++++++++++++++ src/read_taxonomy.jl | 34 ++++++++++++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 52 deletions(-) rename {src => deps}/hydrate.jl (69%) create mode 100644 src/local_archive_path.jl create mode 100644 src/read_taxonomy.jl diff --git a/deps/build.jl b/deps/build.jl index eb580b3..5eaaa56 100644 --- a/deps/build.jl +++ b/deps/build.jl @@ -7,8 +7,9 @@ import Downloads # There are two things we need for the build process: the types, and the # location of the files -include(joinpath(@__DIR__, "..", "src", "hydrate.jl")) +include(joinpath(@__DIR__, "hydrate.jl")) include(joinpath(@__DIR__, "..", "src", "types.jl")) +include(joinpath(@__DIR__, "..", "src", "local_archive_path.jl")) # These steps are meant to download and unpack the taxonomy as needed, which is # to say as unfrequently as possible diff --git a/src/hydrate.jl b/deps/hydrate.jl similarity index 69% rename from src/hydrate.jl rename to deps/hydrate.jl index a3902df..8504e64 100644 --- a/src/hydrate.jl +++ b/deps/hydrate.jl @@ -11,29 +11,6 @@ function _remote_archive_path(; ) end -""" - _local_archive_path() - -Returns the path where the taxonomy dump is stored, and throws a warning if the -path is not set as an environmental variable. This is used during the build step -*and* during the initial startup of the package. -""" -function _local_archive_path()::String - if !haskey(ENV, "NCBITAXONOMY_PATH") - @warn """ - The environmental variable NCBITAXONOMY_PATH is not set, so the tables will - be stored in your home directory. This is not ideal, and you really should set - the NCBITAXONOMY_PATH. - - This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup - file. The path will be created automatically if it does not exist. - """ - end - taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy")) - ispath(taxpath) || mkpath(taxpath) - return taxpath -end - function _get_current_remote_checksum(local_path, remote_info) chk_file = Downloads.download( remote_info.url * remote_info.checksum, diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl index fc27fbe..bf5a69a 100644 --- a/src/NCBITaxonomy.jl +++ b/src/NCBITaxonomy.jl @@ -1,15 +1,15 @@ module NCBITaxonomy using DataFrames -using Arrow +import Arrow using StringDistances using AbstractTrees # Point to where the taxonomy is located -include("hydrate.jl") -local_path = _local_archive_path() +include("local_archive_path.jl") +tables_path = _create_or_get_tables_path(_local_archive_path()) function __init__() - name_date = mtime(joinpath(local_path, "tables", "names.arrow")) + name_date = mtime(joinpath(tables_path, "names.arrow")) over_30_days = time() - name_date >= 2.6e+6 if over_30_days @warn("Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version.") @@ -23,30 +23,9 @@ export NCBITaxon, NCBINameClass, IDNotFoundInBackbone include("exceptions.jl") export NameHasNoDirectMatch, NameHasMultipleMatches -names_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "names.arrow"))) -names_table.class = NCBINameClass.(names_table.class) -names_table.lowercase = lowercase.(names_table.name) - -division_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "division.arrow"))) -select!(division_table, Not(:comments)) - -nodes_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "nodes.arrow"))) -select!(nodes_table, Not(r"inherited_")) -select!(nodes_table, Not(r"_code_id")) -select!(nodes_table, Not(:genbank_hidden)) -select!(nodes_table, Not(:hidden_subtree)) -select!(nodes_table, Not(:comments)) -select!(nodes_table, Not(:embl)) - -nodes_table = innerjoin(nodes_table, division_table; on = :division_id) -select!(nodes_table, Not(:division_id)) - -names_table = leftjoin( - names_table, - unique(select(nodes_table, [:tax_id, :rank, :parent_tax_id])); - on = :tax_id, -) -scinames_table = names_table[findall(names_table.class .== class_scientific_name), :] +# We load the core file with all we need in it +include("read_taxonomy.jl") +taxonomy = read_taxonomy(tables_path) include("taxon.jl") export taxon, @ncbi_str diff --git a/src/local_archive_path.jl b/src/local_archive_path.jl new file mode 100644 index 0000000..252ca2c --- /dev/null +++ b/src/local_archive_path.jl @@ -0,0 +1,28 @@ +""" + _local_archive_path() + +Returns the path where the taxonomy dump is stored, and throws a warning if the +path is not set as an environmental variable. This is used during the build step +*and* during the initial startup of the package. +""" +function _local_archive_path()::String + if !haskey(ENV, "NCBITAXONOMY_PATH") + @warn """ + The environmental variable NCBITAXONOMY_PATH is not set, so the tables will + be stored in your home directory. This is not ideal, and you really should set + the NCBITAXONOMY_PATH. + + This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup + file. The path will be created automatically if it does not exist. + """ + end + taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy")) + ispath(taxpath) || mkpath(taxpath) + return taxpath +end + +function _create_or_get_tables_path(local_path) + tables_path = joinpath(local_path, "tables") + ispath(tables_path) || mkpath(tables_path) + return tables_path +end \ No newline at end of file diff --git a/src/read_taxonomy.jl b/src/read_taxonomy.jl new file mode 100644 index 0000000..4a43f6e --- /dev/null +++ b/src/read_taxonomy.jl @@ -0,0 +1,34 @@ +function read_taxonomy(tables_path) + + # Prepare the files we will actually use + names_table = DataFrame(Arrow.Table(joinpath(tables_path, "names.arrow"))) + names_table.class = NCBINameClass.(names_table.class) + names_table.lowercase = lowercase.(names_table.name) + + division_table = DataFrame(Arrow.Table(joinpath(tables_path, "division.arrow"))) + select!(division_table, Not(:comments)) + + nodes_table = DataFrame(Arrow.Table(joinpath(tables_path, "nodes.arrow"))) + select!(nodes_table, Not(r"inherited_")) + select!(nodes_table, Not(r"_code_id")) + select!(nodes_table, Not(:genbank_hidden)) + select!(nodes_table, Not(:hidden_subtree)) + select!(nodes_table, Not(:comments)) + select!(nodes_table, Not(:embl)) + + nodes_table = innerjoin(nodes_table, division_table; on = :division_id) + select!(nodes_table, Not(:division_id)) + + names_table = leftjoin( + names_table, + unique(select(nodes_table, [:tax_id, :rank, :parent_tax_id])); + on = :tax_id, + ) + + nodes_table = nothing + division_table = nothing + GC.gc() + + return names_table + +end \ No newline at end of file From 36306cf75ca71128e811db2402dfaac2619013e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 16:56:24 -0500 Subject: [PATCH 05/25] =?UTF-8?q?=E2=9A=A1=20only=20keep=20two=20taxo=20db?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/NCBITaxonomy.jl | 12 ++++++++---- src/interfaces/abstracttrees.jl | 16 ++++++++-------- src/namefilters/namefilter.jl | 2 +- src/taxon.jl | 6 +++--- src/utility/nametools.jl | 6 +++--- src/utility/similarnames.jl | 6 ++---- 6 files changed, 25 insertions(+), 23 deletions(-) diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl index bf5a69a..160cf29 100644 --- a/src/NCBITaxonomy.jl +++ b/src/NCBITaxonomy.jl @@ -11,21 +11,25 @@ tables_path = _create_or_get_tables_path(_local_archive_path()) function __init__() name_date = mtime(joinpath(tables_path, "names.arrow")) over_30_days = time() - name_date >= 2.6e+6 - if over_30_days - @warn("Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version.") + if over_30_days + @warn( + "Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version." + ) end return nothing end include("types.jl") -export NCBITaxon, NCBINameClass, IDNotFoundInBackbone +export NCBITaxon, NCBINameClass include("exceptions.jl") -export NameHasNoDirectMatch, NameHasMultipleMatches +export NameHasNoDirectMatch, NameHasMultipleMatches, IDNotFoundInBackbone # We load the core file with all we need in it include("read_taxonomy.jl") taxonomy = read_taxonomy(tables_path) +scinames = filter(r -> r.class == NCBITaxonomy.class_scientific_name, taxonomy) +groupedscinames = groupby(scinames, :tax_id) include("taxon.jl") export taxon, @ncbi_str diff --git a/src/interfaces/abstracttrees.jl b/src/interfaces/abstracttrees.jl index b50b24a..089d274 100644 --- a/src/interfaces/abstracttrees.jl +++ b/src/interfaces/abstracttrees.jl @@ -4,13 +4,13 @@ Returns the children of a taxon. """ function AbstractTrees.children(tax::NCBITaxon) - positions = findall(isequal(tax.id), NCBITaxonomy.scinames_table.parent_tax_id) + positions = findall(isequal(tax.id), NCBITaxonomy.scinames.parent_tax_id) if ~isempty(positions) list_of_children = Vector{NCBITaxon}(undef, length(positions)) for i in axes(positions, 1) list_of_children[i] = NCBITaxon( - NCBITaxonomy.scinames_table.name[positions[i]], - NCBITaxonomy.scinames_table.tax_id[positions[i]], + NCBITaxonomy.scinames.name[positions[i]], + NCBITaxonomy.scinames.tax_id[positions[i]], ) end return list_of_children @@ -25,15 +25,15 @@ end Returns the taxon from which the argument taxon is descended. """ function AbstractTrees.parent(tax::NCBITaxon) - position = findfirst(isequal(tax.id), NCBITaxonomy.scinames_table.tax_id) + position = findfirst(isequal(tax.id), NCBITaxonomy.scinames.tax_id) if ~isnothing(position) parent_position = findfirst( - isequal(NCBITaxonomy.scinames_table.parent_tax_id[position]), - NCBITaxonomy.scinames_table.tax_id, + isequal(NCBITaxonomy.scinames.parent_tax_id[position]), + NCBITaxonomy.scinames.tax_id, ) return NCBITaxon( - NCBITaxonomy.scinames_table.name[parent_position], - NCBITaxonomy.scinames_table.tax_id[parent_position], + NCBITaxonomy.scinames.name[parent_position], + NCBITaxonomy.scinames.tax_id[parent_position], ) else return nothing diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl index 24a9960..56e1d42 100644 --- a/src/namefilters/namefilter.jl +++ b/src/namefilters/namefilter.jl @@ -4,7 +4,7 @@ Returns a subset of the names table where only the given taxids are present. """ function namefilter(ids::Vector{T}) where {T <: Integer} - return leftjoin(DataFrame(; tax_id = ids), NCBITaxonomy.names_table; on = :tax_id) + return leftjoin(DataFrame(; tax_id = ids), NCBITaxonomy.taxonomy; on = :tax_id) end """ diff --git a/src/taxon.jl b/src/taxon.jl index 2d14c1b..eb08a6a 100644 --- a/src/taxon.jl +++ b/src/taxon.jl @@ -15,10 +15,10 @@ end Performs a search in the entire taxonomy backbone based on a known ID. """ -taxon(id::Integer) = taxon(NCBITaxonomy.scinames_table, id) +taxon(id::Integer) = taxon(NCBITaxonomy.scinames, id) function _id_from_name(name::AbstractString; kwargs...) - return _id_from_name(NCBITaxonomy.names_table, name; kwargs...) + return _id_from_name(NCBITaxonomy.taxonomy, name; kwargs...) end function _id_from_name( @@ -91,7 +91,7 @@ The keywords are: - `onlysynonyms` (def. `false`) - limits the search to synonyms, which may be useful in case the taxonomy is particularly outdated """ -taxon(name::AbstractString; kwargs...) = taxon(NCBITaxonomy.names_table, name; kwargs...) +taxon(name::AbstractString; kwargs...) = taxon(NCBITaxonomy.taxonomy, name; kwargs...) """ taxon(df::DataFrame, name::AbstractString; kwargs...) diff --git a/src/utility/nametools.jl b/src/utility/nametools.jl index 59377f5..6d08f98 100644 --- a/src/utility/nametools.jl +++ b/src/utility/nametools.jl @@ -6,7 +6,7 @@ of names if found. It searches the "common name" and "genbank common name" category of the NCBI taxonomy name table. """ function vernacular(t::NCBITaxon) - x = NCBITaxonomy.names_table[findall(NCBITaxonomy.names_table.tax_id .== t.id), :] + x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :] p = findall( !isnothing, indexin( @@ -24,7 +24,7 @@ This function will return `nothing` if no synonyms exist, and an array of names if they do. It returns all of the """ function synonyms(t::NCBITaxon) - x = NCBITaxonomy.names_table[findall(NCBITaxonomy.names_table.tax_id .== t.id), :] + x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :] p = findall(isequal(NCBITaxonomy.class_synonym), x.class) return length(p) == 0 ? nothing : x.name[p] end @@ -36,7 +36,7 @@ This function will return `nothing` if no authority exist, and a string with the authority if found. """ function authority(t::NCBITaxon) - x = NCBITaxonomy.names_table[findall(NCBITaxonomy.names_table.tax_id .== t.id), :] + x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :] p = findall(isequal(NCBITaxonomy.class_authority), x.class) return length(p) == 0 ? nothing : first(x.name[p]) end diff --git a/src/utility/similarnames.jl b/src/utility/similarnames.jl index 218f571..2511eee 100644 --- a/src/utility/similarnames.jl +++ b/src/utility/similarnames.jl @@ -19,8 +19,7 @@ distance - the pair will be returned only once. Additional keywords are `rank` (limit to a given rank) and `onlysynonyms`. """ function similarnames(name::AbstractString; kwargs...) - df = NCBITaxonomy.names_table - return similarnames(df, name; kwargs...) + return similarnames(NCBITaxonomy.taxonomy, name; kwargs...) end """ @@ -72,6 +71,5 @@ does *strict*, *case-sensitive* searches only at the moment, but this may be extended through keyword arguments in a future release. """ function alternativetaxa(name::AbstractString) - df = NCBITaxonomy.names_table - return alternativetaxa(df, name) + return alternativetaxa(NCBITaxonomy.taxonomy, name) end \ No newline at end of file From 9340b8598f6dafde85f2cb4706e93f293923e2a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 16:57:34 -0500 Subject: [PATCH 06/25] =?UTF-8?q?=E2=9E=95=20Downloads=20(tmp.=20Revise)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index 63d4dac..eaf79e3 100644 --- a/Project.toml +++ b/Project.toml @@ -7,8 +7,10 @@ version = "0.4.1" AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63" MD5 = "6ac74813-4b46-53a4-afec-0b5dc9d7885c" +Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404" Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" From b65d0dd52b680e3679450b33010bf3b54f9945a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 17:10:41 -0500 Subject: [PATCH 07/25] =?UTF-8?q?=E2=9A=A1=20speedy=20lookup=20by=20ID?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/taxon.jl | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/src/taxon.jl b/src/taxon.jl index eb08a6a..fef410a 100644 --- a/src/taxon.jl +++ b/src/taxon.jl @@ -1,21 +1,16 @@ -""" - taxon(df::DataFrame, id::Integer) - -Returns a fully formed `NCBITaxon` based on its id. The `name` of the taxon -will be the valid scientic name associated to this id. -""" -function taxon(df::DataFrame, id::Integer) - matched_index = findfirst(isequal(id), df.tax_id) - isnothing(matched_index) && throw(IDNotFoundInBackbone(id)) - return NCBITaxon(df.name[matched_index], id) -end - """ taxon(id::Integer) -Performs a search in the entire taxonomy backbone based on a known ID. +Performs a search in the entire taxonomy backbone based on a known ID. This is +the fastest way to get to a taxon, and is used internally by the tree traversal methods. """ -taxon(id::Integer) = taxon(NCBITaxonomy.scinames, id) +function taxon(id::Integer) + (id in NCBITaxonomy.scinames.tax_id) || throw(IDNotFoundInBackbone(id)) + return NCBITaxon( + only(NCBITaxonomy.groupedscinames[id].name), + only(NCBITaxonomy.groupedscinames[id].tax_id) + ) +end function _id_from_name(name::AbstractString; kwargs...) return _id_from_name(NCBITaxonomy.taxonomy, name; kwargs...) From 4933e8303095259fa6a536f4f9c2a47e51874f9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 17:17:03 -0500 Subject: [PATCH 08/25] =?UTF-8?q?=E2=9A=99=EF=B8=8F=20use=20filter=20in=20?= =?UTF-8?q?namefilter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/namefilters/namefilter.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl index 56e1d42..ccd5cc7 100644 --- a/src/namefilters/namefilter.jl +++ b/src/namefilters/namefilter.jl @@ -23,8 +23,7 @@ end Returns a subset of the names table for all names under a given NCBI division. """ function namefilter(division::Symbol) - ids = findall(isequal(division), NCBITaxonomy.nodes_table.division_code) - return namefilter(NCBITaxonomy.nodes_table.tax_id[ids]) + return namefilter(filter(r -> r.division_code == division, NCBITaxonomy.taxonomy)) end """ @@ -34,6 +33,5 @@ Returns a subset of the names table for all names under a number of multiple NCBI divisions. """ function namefilter(division::Vector{Symbol}) - ids = findall(x -> x in division, NCBITaxonomy.nodes_table.division_code) - return namefilter(NCBITaxonomy.nodes_table.tax_id[ids]) + return namefilter(filter(r -> r.division_code in division, NCBITaxonomy.taxonomy)) end \ No newline at end of file From dd430bceaf2dc9dc1554bfef561c29b8b66c52bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 17:17:17 -0500 Subject: [PATCH 09/25] =?UTF-8?q?=E2=9A=99=EF=B8=8F=20keep=20division=20co?= =?UTF-8?q?de?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/read_taxonomy.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/read_taxonomy.jl b/src/read_taxonomy.jl index 4a43f6e..347132a 100644 --- a/src/read_taxonomy.jl +++ b/src/read_taxonomy.jl @@ -21,7 +21,7 @@ function read_taxonomy(tables_path) names_table = leftjoin( names_table, - unique(select(nodes_table, [:tax_id, :rank, :parent_tax_id])); + unique(select(nodes_table, [:tax_id, :rank, :parent_tax_id, :division_code])); on = :tax_id, ) From efd090256a69044fbd3299ce0d1f65c2561fbb15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 17:17:31 -0500 Subject: [PATCH 10/25] =?UTF-8?q?=E2=9A=A1=20direct=20lookup=20for=20rank?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lineage/rank.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lineage/rank.jl b/src/lineage/rank.jl index 988ec70..a8bd880 100644 --- a/src/lineage/rank.jl +++ b/src/lineage/rank.jl @@ -4,6 +4,5 @@ Returns the rank of a taxon. """ function rank(tax::NCBITaxon) - position = findfirst(isequal(tax.id), NCBITaxonomy.nodes_table.tax_id) - return NCBITaxonomy.nodes_table.rank[position] + return only(NCBITaxonomy.groupedscinames[tax.id].rank) end From b276c7bda7af37822dcf3de7a7e3ad83325881d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 17:29:10 -0500 Subject: [PATCH 11/25] =?UTF-8?q?=F0=9F=90=9B=20namefilter=20from=20df=20f?= =?UTF-8?q?ilter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/namefilters/namefilter.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl index ccd5cc7..1e3a588 100644 --- a/src/namefilters/namefilter.jl +++ b/src/namefilters/namefilter.jl @@ -23,7 +23,7 @@ end Returns a subset of the names table for all names under a given NCBI division. """ function namefilter(division::Symbol) - return namefilter(filter(r -> r.division_code == division, NCBITaxonomy.taxonomy)) + return filter(r -> r.division_code == division, NCBITaxonomy.taxonomy) end """ @@ -33,5 +33,5 @@ Returns a subset of the names table for all names under a number of multiple NCBI divisions. """ function namefilter(division::Vector{Symbol}) - return namefilter(filter(r -> r.division_code in division, NCBITaxonomy.taxonomy)) + return filter(r -> r.division_code in division, NCBITaxonomy.taxonomy) end \ No newline at end of file From 518faa955c06bc51b98e7f55010ecce7feb295a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 17:36:08 -0500 Subject: [PATCH 12/25] =?UTF-8?q?=F0=9F=90=9B=20index=20by=20tuple?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lineage/rank.jl | 2 +- src/taxon.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lineage/rank.jl b/src/lineage/rank.jl index a8bd880..6426301 100644 --- a/src/lineage/rank.jl +++ b/src/lineage/rank.jl @@ -4,5 +4,5 @@ Returns the rank of a taxon. """ function rank(tax::NCBITaxon) - return only(NCBITaxonomy.groupedscinames[tax.id].rank) + return only(NCBITaxonomy.groupedscinames[(tax_id = tax.id,)].rank) end diff --git a/src/taxon.jl b/src/taxon.jl index fef410a..310aa73 100644 --- a/src/taxon.jl +++ b/src/taxon.jl @@ -7,8 +7,8 @@ the fastest way to get to a taxon, and is used internally by the tree traversal function taxon(id::Integer) (id in NCBITaxonomy.scinames.tax_id) || throw(IDNotFoundInBackbone(id)) return NCBITaxon( - only(NCBITaxonomy.groupedscinames[id].name), - only(NCBITaxonomy.groupedscinames[id].tax_id) + only(NCBITaxonomy.groupedscinames[(tax_id = id, )].name), + only(NCBITaxonomy.groupedscinames[(tax_id = id, )].tax_id) ) end From 276de150ba8338c303ff18726f7f3f6e587a8f90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 17:44:20 -0500 Subject: [PATCH 13/25] =?UTF-8?q?=E2=9A=A1=20update=20some=20benchmarks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- benchmark/benchmarks.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index de3be02..65ed0e7 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -13,6 +13,9 @@ tax = [ mf = mammalfilter() +mf_t = mammalfilter(true) +pf = primatefilter() + const SUITE = BenchmarkGroup() # Construction of name finders @@ -41,6 +44,14 @@ SUITE["taxon search"]["lowercase with finder"] = SUITE["taxon search"]["scientific with finder"] = @benchmarkable taxon(mf, "Sus scrofa"; preferscientific = true) +SUITE["taxon search"]["pan - all defaults"] = @benchmarkable taxon("Pan") + +SUITE["taxon search"]["pan - mammal finder"] = @benchmarkable taxon(mf_t, "Pan") + +SUITE["taxon search"]["pan - primate finder"] = @benchmarkable taxon(pf, "Pan") + +SUITE["taxon search"]["pan - string annotation"] = @benchmarkable ncbi"Pan" + # Ability to traverse a tree SUITE["traversal"] = BenchmarkGroup(["search", "tree traversal"]) From 1e8170602e5b835da0bc23cba81165f79f89ba5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 17:59:49 -0500 Subject: [PATCH 14/25] =?UTF-8?q?=E2=9A=A1=20update=20the=20speed=20of=20n?= =?UTF-8?q?amefinders?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/namefilters/namefilter.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl index 1e3a588..568cea2 100644 --- a/src/namefilters/namefilter.jl +++ b/src/namefilters/namefilter.jl @@ -23,7 +23,7 @@ end Returns a subset of the names table for all names under a given NCBI division. """ function namefilter(division::Symbol) - return filter(r -> r.division_code == division, NCBITaxonomy.taxonomy) + return groupby(NCBITaxonomy.taxonomy, :division_code)[(division_code = division, )] end """ @@ -33,5 +33,5 @@ Returns a subset of the names table for all names under a number of multiple NCBI divisions. """ function namefilter(division::Vector{Symbol}) - return filter(r -> r.division_code in division, NCBITaxonomy.taxonomy) + return groupby(NCBITaxonomy.taxonomy, :division_code)[[(division_code = div,) for div in division]] end \ No newline at end of file From c749b0df8e8e54eafbbed8b4141a146ca4e75d23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 18:09:55 -0500 Subject: [PATCH 15/25] =?UTF-8?q?=E2=9A=A1=20minimize=20memory=20footprint?= =?UTF-8?q?=20of=20the=20table?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/read_taxonomy.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/read_taxonomy.jl b/src/read_taxonomy.jl index 347132a..bf1e9bd 100644 --- a/src/read_taxonomy.jl +++ b/src/read_taxonomy.jl @@ -29,6 +29,9 @@ function read_taxonomy(tables_path) division_table = nothing GC.gc() + select!(names_table, Not(:unique_name)) + dropmissing!(names_table, [:rank, :parent_tax_id, :division_code]) + return names_table end \ No newline at end of file From 01a7ca7a5b8814c0095e5553b86cd343b6cb2135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 18:15:32 -0500 Subject: [PATCH 16/25] =?UTF-8?q?=F0=9F=90=9B=20make=20taxon=20work=20with?= =?UTF-8?q?=20abstract=20dfs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/taxon.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/taxon.jl b/src/taxon.jl index 310aa73..e3a0351 100644 --- a/src/taxon.jl +++ b/src/taxon.jl @@ -7,8 +7,8 @@ the fastest way to get to a taxon, and is used internally by the tree traversal function taxon(id::Integer) (id in NCBITaxonomy.scinames.tax_id) || throw(IDNotFoundInBackbone(id)) return NCBITaxon( - only(NCBITaxonomy.groupedscinames[(tax_id = id, )].name), - only(NCBITaxonomy.groupedscinames[(tax_id = id, )].tax_id) + only(NCBITaxonomy.groupedscinames[(tax_id = id,)].name), + only(NCBITaxonomy.groupedscinames[(tax_id = id,)].tax_id), ) end @@ -17,7 +17,7 @@ function _id_from_name(name::AbstractString; kwargs...) end function _id_from_name( - df::DataFrame, + df::T, name::AbstractString; strict::Bool = true, dist::Type{SD} = Levenshtein, @@ -25,7 +25,7 @@ function _id_from_name( rank::Union{Nothing, Symbol} = nothing, preferscientific::Bool = false, onlysynonyms::Bool = false, -) where {SD <: StringDistance} +) where {SD <: StringDistance, T <: AbstractDataFrame} if !isnothing(rank) @assert rank ∈ unique(df.rank) df = df[findall(isequal(rank), df.rank), :] @@ -94,7 +94,7 @@ taxon(name::AbstractString; kwargs...) = taxon(NCBITaxonomy.taxonomy, name; kwar Additional method for `taxon` with an extra dataframe argument, used most often with a `namefinder`. Accepts the usual `taxon` keyword arguments. """ -function taxon(df::DataFrame, name::AbstractString; kwargs...) +function taxon(df::T, name::String; kwargs...) where {T <: AbstractDataFrame} id = _id_from_name(df, name; kwargs...) isnothing(id) && return nothing return taxon(id) From 3ebcc5a82e9fe4c8f1336855f2c1fe0f315ad2ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 18:16:45 -0500 Subject: [PATCH 17/25] =?UTF-8?q?=F0=9F=A6=86=20make=20sure=20name=20is=20?= =?UTF-8?q?a=20String?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/taxon.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/taxon.jl b/src/taxon.jl index e3a0351..98dab01 100644 --- a/src/taxon.jl +++ b/src/taxon.jl @@ -18,7 +18,7 @@ end function _id_from_name( df::T, - name::AbstractString; + name::String; strict::Bool = true, dist::Type{SD} = Levenshtein, casesensitive::Bool = true, From 46f8a52d77731342e1db67311ade6f5d6cf59da6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 18:18:33 -0500 Subject: [PATCH 18/25] =?UTF-8?q?=F0=9F=9A=A7=20allow=20missings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/read_taxonomy.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/read_taxonomy.jl b/src/read_taxonomy.jl index bf1e9bd..226d691 100644 --- a/src/read_taxonomy.jl +++ b/src/read_taxonomy.jl @@ -30,7 +30,7 @@ function read_taxonomy(tables_path) GC.gc() select!(names_table, Not(:unique_name)) - dropmissing!(names_table, [:rank, :parent_tax_id, :division_code]) + #dropmissing!(names_table, [:rank, :parent_tax_id, :division_code]) return names_table From 79e2dde08d4ea693a66ed860b5ae446b6abdfdba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 20:49:37 -0500 Subject: [PATCH 19/25] =?UTF-8?q?=F0=9F=90=9B=20return=20data=20frame=20fo?= =?UTF-8?q?r=20filtering=20on=20multiple=20divisions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/namefilters/namefilter.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl index 568cea2..2e43ac9 100644 --- a/src/namefilters/namefilter.jl +++ b/src/namefilters/namefilter.jl @@ -33,5 +33,5 @@ Returns a subset of the names table for all names under a number of multiple NCBI divisions. """ function namefilter(division::Vector{Symbol}) - return groupby(NCBITaxonomy.taxonomy, :division_code)[[(division_code = div,) for div in division]] + return vcat(groupby(NCBITaxonomy.taxonomy, :division_code)[[(division_code = div,) for div in division]]...) end \ No newline at end of file From de7c44132589d2f95f353650ad203a96a7f69d17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 20:56:44 -0500 Subject: [PATCH 20/25] =?UTF-8?q?=E2=9A=A1=20authority?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/NCBITaxonomy.jl | 1 + src/utility/nametools.jl | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl index 160cf29..2d4bf8d 100644 --- a/src/NCBITaxonomy.jl +++ b/src/NCBITaxonomy.jl @@ -30,6 +30,7 @@ include("read_taxonomy.jl") taxonomy = read_taxonomy(tables_path) scinames = filter(r -> r.class == NCBITaxonomy.class_scientific_name, taxonomy) groupedscinames = groupby(scinames, :tax_id) +groupedtaxonomy = groupby(taxonomy, :tax_id) include("taxon.jl") export taxon, @ncbi_str diff --git a/src/utility/nametools.jl b/src/utility/nametools.jl index 6d08f98..0e1df35 100644 --- a/src/utility/nametools.jl +++ b/src/utility/nametools.jl @@ -35,8 +35,10 @@ end This function will return `nothing` if no authority exist, and a string with the authority if found. """ -function authority(t::NCBITaxon) - x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :] - p = findall(isequal(NCBITaxonomy.class_authority), x.class) - return length(p) == 0 ? nothing : first(x.name[p]) +function authority(tax::NCBITaxon) + auth = filter(r -> r.class == NCBITaxonomy.class_authority, NCBITaxonomy.groupedtaxonomy[(tax_id = tax.id, )]) + if isempty(auth) + return nothing + end + return only(auth.name) end From 2653d793e437ecd9779af2adec9a998558a61c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 21:00:45 -0500 Subject: [PATCH 21/25] =?UTF-8?q?=E2=9A=A1=20nametools?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/utility/nametools.jl | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/utility/nametools.jl b/src/utility/nametools.jl index 0e1df35..b1c83e3 100644 --- a/src/utility/nametools.jl +++ b/src/utility/nametools.jl @@ -5,16 +5,16 @@ This function will return `nothing` if no vernacular name is known, and an array of names if found. It searches the "common name" and "genbank common name" category of the NCBI taxonomy name table. """ -function vernacular(t::NCBITaxon) - x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :] - p = findall( - !isnothing, - indexin( - x.class, - [NCBITaxonomy.class_common_name, NCBITaxonomy.class_genbank_common_name], - ), +function vernacular(tax::NCBITaxon) + vern = filter( + r -> r.class in + [NCBITaxonomy.class_common_name, NCBITaxonomy.class_genbank_common_name], + NCBITaxonomy.groupedtaxonomy[(tax_id = tax.id,)], ) - return length(p) == 0 ? nothing : x.name[p] + if isempty(vern) + return nothing + end + return vern.name end """ @@ -23,10 +23,15 @@ end This function will return `nothing` if no synonyms exist, and an array of names if they do. It returns all of the """ -function synonyms(t::NCBITaxon) - x = NCBITaxonomy.taxonomy[findall(NCBITaxonomy.taxonomy.tax_id .== t.id), :] - p = findall(isequal(NCBITaxonomy.class_synonym), x.class) - return length(p) == 0 ? nothing : x.name[p] +function synonyms(tax::NCBITaxon) + syno = filter( + r -> r.class == NCBITaxonomy.class_synonym, + NCBITaxonomy.groupedtaxonomy[(tax_id = tax.id,)], + ) + if isempty(syno) + return nothing + end + return syno.name end """ @@ -36,7 +41,10 @@ This function will return `nothing` if no authority exist, and a string with the authority if found. """ function authority(tax::NCBITaxon) - auth = filter(r -> r.class == NCBITaxonomy.class_authority, NCBITaxonomy.groupedtaxonomy[(tax_id = tax.id, )]) + auth = filter( + r -> r.class == NCBITaxonomy.class_authority, + NCBITaxonomy.groupedtaxonomy[(tax_id = tax.id,)], + ) if isempty(auth) return nothing end From 9180a319c4f9a6e84934616ca7b6084d550199ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 21:05:57 -0500 Subject: [PATCH 22/25] =?UTF-8?q?=F0=9F=A6=86=20remove=20all=20union{missi?= =?UTF-8?q?ng}=20from=20the=20taxonomy=20table?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/read_taxonomy.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/read_taxonomy.jl b/src/read_taxonomy.jl index 226d691..bf1e9bd 100644 --- a/src/read_taxonomy.jl +++ b/src/read_taxonomy.jl @@ -30,7 +30,7 @@ function read_taxonomy(tables_path) GC.gc() select!(names_table, Not(:unique_name)) - #dropmissing!(names_table, [:rank, :parent_tax_id, :division_code]) + dropmissing!(names_table, [:rank, :parent_tax_id, :division_code]) return names_table From 4a04f3093e629f260bd44d0d64fd2b543f92be9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 21:23:20 -0500 Subject: [PATCH 23/25] =?UTF-8?q?=E2=9A=A1=20pre-split=20the=20division=20?= =?UTF-8?q?table?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/NCBITaxonomy.jl | 1 + src/namefilters/namefilter.jl | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl index 2d4bf8d..ef7b91e 100644 --- a/src/NCBITaxonomy.jl +++ b/src/NCBITaxonomy.jl @@ -31,6 +31,7 @@ taxonomy = read_taxonomy(tables_path) scinames = filter(r -> r.class == NCBITaxonomy.class_scientific_name, taxonomy) groupedscinames = groupby(scinames, :tax_id) groupedtaxonomy = groupby(taxonomy, :tax_id) +divisions = groupby(taxonomy, :division_code) include("taxon.jl") export taxon, @ncbi_str diff --git a/src/namefilters/namefilter.jl b/src/namefilters/namefilter.jl index 2e43ac9..ae7f298 100644 --- a/src/namefilters/namefilter.jl +++ b/src/namefilters/namefilter.jl @@ -23,7 +23,7 @@ end Returns a subset of the names table for all names under a given NCBI division. """ function namefilter(division::Symbol) - return groupby(NCBITaxonomy.taxonomy, :division_code)[(division_code = division, )] + return NCBITaxonomy.divisions[(division_code = division,)] end """ @@ -33,5 +33,5 @@ Returns a subset of the names table for all names under a number of multiple NCBI divisions. """ function namefilter(division::Vector{Symbol}) - return vcat(groupby(NCBITaxonomy.taxonomy, :division_code)[[(division_code = div,) for div in division]]...) + return vcat(NCBITaxonomy.divisions[[(division_code = div,) for div in division]]...) end \ No newline at end of file From a1599a732055ea0f88eb555069d70bb9b21483f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 21:31:05 -0500 Subject: [PATCH 24/25] =?UTF-8?q?=F0=9F=A6=86=20remove=20Abstract=20from?= =?UTF-8?q?=20strings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/taxon.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/taxon.jl b/src/taxon.jl index 98dab01..df015d4 100644 --- a/src/taxon.jl +++ b/src/taxon.jl @@ -12,7 +12,7 @@ function taxon(id::Integer) ) end -function _id_from_name(name::AbstractString; kwargs...) +function _id_from_name(name::String; kwargs...) return _id_from_name(NCBITaxonomy.taxonomy, name; kwargs...) end @@ -86,7 +86,7 @@ The keywords are: - `onlysynonyms` (def. `false`) - limits the search to synonyms, which may be useful in case the taxonomy is particularly outdated """ -taxon(name::AbstractString; kwargs...) = taxon(NCBITaxonomy.taxonomy, name; kwargs...) +taxon(name::String; kwargs...) = taxon(NCBITaxonomy.taxonomy, name; kwargs...) """ taxon(df::DataFrame, name::AbstractString; kwargs...) From 3eda3e77bbcf0836523ab798cb3a7324fe6fb88d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Sat, 4 Mar 2023 21:55:05 -0500 Subject: [PATCH 25/25] =?UTF-8?q?=E2=9A=A1=20taxon=20by=20id=20speedup?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- benchmark/benchmarks.jl | 2 + src/NCBITaxonomy.jl | 1 + src/taxon.jl | 89 ++++++++++++++++++++++------------------- 3 files changed, 50 insertions(+), 42 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 65ed0e7..d2cef21 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -28,6 +28,8 @@ SUITE["name finders"]["mammals (inclusive)"] = @benchmarkable mammalfilter(true) SUITE["name finders"]["phage"] = @benchmarkable phagefilter() +SUITE["name finders"]["descendants of Diplectanidae"] = @benchmarkable descendantsfilter(ncbi"Diplectanidae") + # Ability to locate taxa SUITE["taxon search"] = BenchmarkGroup(["namefinding", "search"]) diff --git a/src/NCBITaxonomy.jl b/src/NCBITaxonomy.jl index ef7b91e..c1e9c5d 100644 --- a/src/NCBITaxonomy.jl +++ b/src/NCBITaxonomy.jl @@ -32,6 +32,7 @@ scinames = filter(r -> r.class == NCBITaxonomy.class_scientific_name, taxonomy) groupedscinames = groupby(scinames, :tax_id) groupedtaxonomy = groupby(taxonomy, :tax_id) divisions = groupby(taxonomy, :division_code) +ranks = groupby(taxonomy, :rank) include("taxon.jl") export taxon, @ncbi_str diff --git a/src/taxon.jl b/src/taxon.jl index df015d4..0cb3775 100644 --- a/src/taxon.jl +++ b/src/taxon.jl @@ -5,17 +5,50 @@ Performs a search in the entire taxonomy backbone based on a known ID. This is the fastest way to get to a taxon, and is used internally by the tree traversal methods. """ function taxon(id::Integer) - (id in NCBITaxonomy.scinames.tax_id) || throw(IDNotFoundInBackbone(id)) - return NCBITaxon( - only(NCBITaxonomy.groupedscinames[(tax_id = id,)].name), - only(NCBITaxonomy.groupedscinames[(tax_id = id,)].tax_id), - ) + try + m = only(NCBITaxonomy.groupedscinames[(tax_id = id,)]) + return NCBITaxon( + m.name, + m.tax_id, + ) + catch + throw(IDNotFoundInBackbone(id)) + end end function _id_from_name(name::String; kwargs...) return _id_from_name(NCBITaxonomy.taxonomy, name; kwargs...) end +function _strict_matches( + df::T, + name::String, + casesensitive::Bool, +) where {T <: AbstractDataFrame} + positions = if casesensitive + findall(==(name), df.name) + else + findall(==(lowercase(name)), df.lowercase) + end + isempty(positions) && return nothing + return positions +end + +function _fuzzy_matches( + df::T, + name::String, + casesensitive::Bool, + dist::Type{SD} +) where {T <: AbstractDataFrame, SD <: StringDistance} + positions = if casesensitive + last(findnearest(name, df.name, dist())) + else + last(findnearest(lowercase(name), df.lowercase, dist())) + end + isempty(positions) && return nothing + return positions +end + function _id_from_name( df::T, name::String; @@ -24,46 +57,18 @@ function _id_from_name( casesensitive::Bool = true, rank::Union{Nothing, Symbol} = nothing, preferscientific::Bool = false, - onlysynonyms::Bool = false, ) where {SD <: StringDistance, T <: AbstractDataFrame} - if !isnothing(rank) - @assert rank ∈ unique(df.rank) - df = df[findall(isequal(rank), df.rank), :] - end - if onlysynonyms - df = df[findall(isequal(NCBITaxonomy.class_synonym), df.class), :] - end - if strict - positions = if casesensitive - findall(isequal(name), df.name) - else - findall(isequal(lowercase(name)), df.lowercase) - end - # If the array is empty, we throw the "no name" error - isempty(positions) && throw(NameHasNoDirectMatch(name)) - # If the array has a single element, this is the ticket - length(positions) == 1 && return df.tax_id[first(positions)] - # If we prefer scientific names, we can filter with this - if preferscientific - if NCBITaxonomy.class_scientific_name in df.class[positions] - ids = df.tax_id[positions][findall( - isequal(NCBITaxonomy.class_scientific_name), - df.class[positions], - )] - if length(ids) == 1 - return first(ids) - else - throw(NameHasMultipleMatches(name, taxon.(ids))) - end - end - end - # If neither of these are satisfied, the name has multiple matches and we throw the appropriate error - taxa = taxon.(df.tax_id[positions]) - throw(NameHasMultipleMatches(name, taxa)) + # Perform the correct search + positions = if strict + _strict_matches(df, name, casesensitive) else - correct_name, position = findnearest(name, df.name, dist()) - return df.tax_id[position] + _fuzzy_matches(df, name, casesensitive, dist) end + length(positions) == 1 && return df.tax_id[only(positions)] + isempty(positions) && throw(NameHasNoDirectMatch(name)) + @info df[positions,:] + #taxa = taxon.(df.tax_id[positions]) + #throw(NameHasMultipleMatches(name, taxa)) end """