Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvements with smarter handling of raw data #57

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
57d3c10
🛑 sanitize the download/extract step
tpoisot Mar 4, 2023
c854aca
🚮 only check the local checksum after making sure the archive is here
tpoisot Mar 4, 2023
eae7618
✨ build step cleaned up
tpoisot Mar 4, 2023
1683801
⚙️ move the code to read the taxonomy out of the main file
tpoisot Mar 4, 2023
36306cf
⚡ only keep two taxo dbs
tpoisot Mar 4, 2023
9340b85
➕ Downloads (tmp. Revise)
tpoisot Mar 4, 2023
b65d0dd
⚡ speedy lookup by ID
tpoisot Mar 4, 2023
4933e83
⚙️ use filter in namefilter
tpoisot Mar 4, 2023
dd430bc
⚙️ keep division code
tpoisot Mar 4, 2023
efd0902
⚡ direct lookup for rank
tpoisot Mar 4, 2023
b276c7b
🐛 namefilter from df filter
tpoisot Mar 4, 2023
518faa9
🐛 index by tuple
tpoisot Mar 4, 2023
276de15
⚡ update some benchmarks
tpoisot Mar 4, 2023
1e81706
⚡ update the speed of namefinders
tpoisot Mar 4, 2023
c749b0d
⚡ minimize memory footprint of the table
tpoisot Mar 4, 2023
4db9187
Merge branch 'main' into feature/speedup-groupdb
tpoisot Mar 4, 2023
01a7ca7
🐛 make taxon work with abstract dfs
tpoisot Mar 4, 2023
87a95f2
Merge branch 'feature/speedup-groupdb' of https://github.com/PoisotLa…
tpoisot Mar 4, 2023
3ebcc5a
🦆 make sure name is a String
tpoisot Mar 4, 2023
46f8a52
🚧 allow missings
tpoisot Mar 4, 2023
79e2dde
🐛 return data frame for filtering on multiple divisions
tpoisot Mar 5, 2023
de7c441
⚡ authority
tpoisot Mar 5, 2023
2653d79
⚡ nametools
tpoisot Mar 5, 2023
9180a31
🦆 remove all union{missing} from the taxonomy table
tpoisot Mar 5, 2023
4a04f30
⚡ pre-split the division table
tpoisot Mar 5, 2023
a1599a7
🦆 remove Abstract from strings
tpoisot Mar 5, 2023
3eda3e7
⚡ taxon by id speedup
tpoisot Mar 5, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ version = "0.4.1"
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
MD5 = "6ac74813-4b46-53a4-afec-0b5dc9d7885c"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"

Expand Down
2 changes: 2 additions & 0 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ SUITE["name finders"]["mammals (inclusive)"] = @benchmarkable mammalfilter(true)

SUITE["name finders"]["phage"] = @benchmarkable phagefilter()

SUITE["name finders"]["descendants of Diplectanidae"] = @benchmarkable descendantsfilter(ncbi"Diplectanidae")

# Ability to locate taxa

SUITE["taxon search"] = BenchmarkGroup(["namefinding", "search"])
Expand Down
139 changes: 51 additions & 88 deletions deps/build.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,71 +3,25 @@ import GZip
import Tar
import Arrow
import DataFrames
import Downloads

# URL for the taxonomy dump
const ncbi_ftp = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/"
const archive = ncbi_ftp * "new_taxdump.tar.gz"
const checksum = archive * ".md5"

if !haskey(ENV, "NCBITAXONOMY_PATH")
@warn """
The environmental variable NCBITAXONOMY_PATH is not set, so the tables will
be stored in your home directory. This is not ideal, and you really should set
the NCBITAXONOMY_PATH.

This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup
file. The path will be created automatically if it does not exist.
"""
end
const taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy"))
ispath(taxpath) || mkpath(taxpath)

chk_file = download(checksum)
chk = split(readlines(chk_file)[1], " ")[1]
@info "Checksum of the most recent NCBI taxonomy: $(chk)"

function download_dump(url, chk, dest)
@info "Downloading the taxonomy data from $(url)"
if ispath(joinpath(taxpath, dest))
@info "Removing the previous version of the taxonomy"
rm(joinpath(taxpath, dest); force=true, recursive=true)
mkpath(joinpath(taxpath, dest))
else
mkpath(joinpath(taxpath, dest))
end
arc = download(url)
vrf = bytes2hex(open(MD5.md5, arc))
vrf == chk || throw(ErrorException("Wrong checksum for the NCBI taxonomy archive file - unable to download"))
write(joinpath(taxpath, ".checksum"), vrf)
Tar.extract(GZip.open(arc), joinpath(taxpath, dest))
end

# The next block is about making sure that we don't download something that has
# not changed when we build the package. The taxonomy dump is not gigantic, but
# there is no need to get it over and over again.
if !isfile(joinpath(taxpath, ".checksum"))
@info "No local taxonomy checksum found"
download_dump(archive, chk, "dump")
else
local_chk = readline(joinpath(taxpath, ".checksum"))
if local_chk != chk
@info "Local and remote checksum do not match"
download_dump(archive, chk, "dump")
else
@info "Local taxonomy dump ($(local_chk)) is up to date"
end
end
# There are two things we need for the build process: the types, and the
# location of the files
include(joinpath(@__DIR__, "hydrate.jl"))
include(joinpath(@__DIR__, "..", "src", "types.jl"))
include(joinpath(@__DIR__, "..", "src", "local_archive_path.jl"))

@info "Materializing the taxonomy"
# These steps are meant to download and unpack the taxonomy as needed, which is
# to say as infrequently as possible
remote_info = _remote_archive_path()
local_path = _local_archive_path()
remote_checksum = _get_current_remote_checksum(local_path, remote_info)
local_archive = _unpack_if_needed(local_path, remote_info, remote_checksum)

# We will store the tables used by the package in the tables folder
tables = joinpath(taxpath, "tables")
ispath(tables) || mkpath(tables)
tables_path = _create_or_get_tables_path(local_path)

# Utility functions

include(joinpath(@__DIR__, "..", "src", "types.jl"))

function _class_to_enum(c::T) where {T <: String}
c = replace(c, " " => "_")
c = replace(c, "-" => "_")
Expand All @@ -84,8 +38,8 @@ issue.
function _materialize_data(::Type{T}, v) where {T}
if v != ""
T <: Number && return parse(T, v)
T <: Union{Bool,Missing} && return parse(Bool, v)
T <: Union{Int,Missing} && return parse(Int, v)
T <: Union{Bool, Missing} && return parse(Bool, v)
T <: Union{Int, Missing} && return parse(Int, v)
T <: Symbol && return Symbol(v)
T <: NCBINameClass && return _class_to_enum(v)
return v
Expand All @@ -107,44 +61,53 @@ function _build_arrow_file(df, dump_file)
return df
end

# Get the data

@info "Building the names file"
ncbi_names_file_in = joinpath(taxpath, "dump", "names.dmp")
ncbi_names_file_out = joinpath(taxpath, "tables", "names.arrow")
ncbi_names = DataFrames.DataFrame(tax_id=Int[], name=String[], unique_name=Union{String,Missing}[], class=NCBINameClass[])
# Get the data for the names
ncbi_names_file_in = joinpath(local_path, "dump", "names.dmp")
ncbi_names_file_out = joinpath(tables_path, "names.arrow")
ncbi_names = DataFrames.DataFrame(;
tax_id = Int[],
name = String[],
unique_name = Union{String, Missing}[],
class = NCBINameClass[],
)
names_df = _build_arrow_file(ncbi_names, ncbi_names_file_in)
names_df.class = Int.(names_df.class)
Arrow.write(ncbi_names_file_out, names_df)
names_df = nothing
GC.gc()

@info "Building the division file"
ncbi_division_file_in = joinpath(taxpath, "dump", "division.dmp")
ncbi_division_file_out = joinpath(taxpath, "tables", "division.arrow")
ncbi_division = DataFrames.DataFrame(division_id=Int[], division_code=Symbol[], division_name=Symbol[], comments=Union{String,Missing}[])
ncbi_division_file_in = joinpath(local_path, "dump", "division.dmp")
ncbi_division_file_out = joinpath(tables_path, "division.arrow")
ncbi_division = DataFrames.DataFrame(;
division_id = Int[],
division_code = Symbol[],
division_name = Symbol[],
comments = Union{String, Missing}[],
)
division_df = _build_arrow_file(ncbi_division, ncbi_division_file_in)
Arrow.write(ncbi_division_file_out, division_df)
division_df = nothing
GC.gc()

@info "Building the nodes file"
ncbi_nodes_file_in = joinpath(taxpath, "dump", "nodes.dmp")
ncbi_nodes_file_out = joinpath(taxpath, "tables", "nodes.arrow")
ncbi_nodes = DataFrames.DataFrame(
tax_id=Int[], parent_tax_id=Int[],
rank=Symbol[],
embl=Union{String,Missing}[],
division_id=Int[], inherited_div=Union{Bool,Missing}[],
genetic_code_id=Int[], inherited_gc=Union{Bool,Missing}[],
mitochondrial_genetic_code_id=Union{Int,Missing}[], inherited_mgc=Union{Bool,Missing}[],
genbank_hidden=Union{Bool,Missing}[],
hidden_subtree=Union{Bool,Missing}[],
comments=Union{String,Missing}[],
plastid_genetic_code_id=Union{Int,Missing}[], inherited_pgc=Union{Bool,Missing}[],
specified_species=Union{Bool,Missing}[],
hydrogenosome_code_id=Union{Int,Missing}[], inherited_hgc=Union{Bool,Missing}[]
)
ncbi_nodes_file_in = joinpath(local_path, "dump", "nodes.dmp")
ncbi_nodes_file_out = joinpath(tables_path, "nodes.arrow")
ncbi_nodes = DataFrames.DataFrame(;
tax_id = Int[], parent_tax_id = Int[],
rank = Symbol[],
embl = Union{String, Missing}[],
division_id = Int[], inherited_div = Union{Bool, Missing}[],
genetic_code_id = Int[], inherited_gc = Union{Bool, Missing}[],
mitochondrial_genetic_code_id = Union{Int, Missing}[],
inherited_mgc = Union{Bool, Missing}[],
genbank_hidden = Union{Bool, Missing}[],
hidden_subtree = Union{Bool, Missing}[],
comments = Union{String, Missing}[],
plastid_genetic_code_id = Union{Int, Missing}[],
inherited_pgc = Union{Bool, Missing}[],
specified_species = Union{Bool, Missing}[],
hydrogenosome_code_id = Union{Int, Missing}[],
inherited_hgc = Union{Bool, Missing}[],
)
nodes_df = _build_arrow_file(ncbi_nodes, ncbi_nodes_file_in)
Arrow.write(ncbi_nodes_file_out, nodes_df)
nodes_df = nothing
Expand Down
57 changes: 57 additions & 0 deletions deps/hydrate.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""
    _remote_archive_path(; ncbi_ftp)

Describe where the taxonomy archive is found on the remote server.

Returns a named tuple with three `String` fields: `url` (the value of the
`ncbi_ftp` keyword), `archive` (the archive file name), and `checksum` (the
name of the accompanying `.md5` file).
"""
function _remote_archive_path(;
    ncbi_ftp = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/",
)::NamedTuple{(:url, :archive, :checksum), NTuple{3, String}}
    archive_name = "new_taxdump.tar.gz"
    checksum_name = "new_taxdump.tar.gz.md5"
    return (url = ncbi_ftp, archive = archive_name, checksum = checksum_name)
end

"""
    _get_current_remote_checksum(local_path, remote_info)

Download the remote checksum file to `.checksum.remote` inside `local_path`,
and return the first space-delimited token found on its first line.

The remote location is `remote_info.url` followed by `remote_info.checksum`.
"""
function _get_current_remote_checksum(local_path, remote_info)
    source = remote_info.url * remote_info.checksum
    destination = joinpath(local_path, ".checksum.remote")
    downloaded = Downloads.download(source, destination)
    first_line = readlines(downloaded)[1]
    return first(split(first_line, " "))
end

"""
    _download_archive(local_path, remote_info)

Fetch the archive located at `remote_info.url * remote_info.archive`, store it
under the same file name inside `local_path`, and return the path of the local
copy.
"""
function _download_archive(local_path, remote_info)
    source = remote_info.url * remote_info.archive
    destination = joinpath(local_path, remote_info.archive)
    Downloads.download(source, destination)
    return destination
end

"""
    _unpack_if_needed(local_path, remote_info, remote_checksum)

Make sure an unpacked copy of the taxonomy dump exists under `local_path`, and
return the path to its `dump` folder.

The archive named by `remote_info.archive` is downloaded (through
`_download_archive`) when it is absent from `local_path`, or when its MD5 digest
differs from `remote_checksum`. The archive is (re-)extracted whenever a
download took place, or when the `dump` folder does not exist yet; in that case
any previous `dump` folder is deleted first.
"""
function _unpack_if_needed(local_path, remote_info, remote_checksum)
    local_archive = joinpath(local_path, remote_info.archive)
    dump_path = joinpath(local_path, "dump")
    # An archive without an unpacked folder next to it still needs extracting
    need_update = !isdir(dump_path)
    if !isfile(local_archive)
        @warn "There is no local taxonomy dump, we will download one"
        local_archive = _download_archive(local_path, remote_info)
        need_update = true
    end
    local_checksum = bytes2hex(open(MD5.md5, local_archive))
    if local_checksum != remote_checksum
        @warn "The checksum of the taxonomy dump does not match the remote"
        local_archive = _download_archive(local_path, remote_info)
        local_checksum = bytes2hex(open(MD5.md5, local_archive))
        # Report (without aborting the build) a fresh download that still
        # disagrees with the advertised checksum
        if local_checksum != remote_checksum
            @warn "The checksum of the freshly downloaded taxonomy dump still does not match the remote"
        end
        need_update = true
    end
    if need_update
        @warn "We are unpacking the local taxonomy dump"
        # `force` only tolerates a missing path; `recursive` is required to
        # delete a populated folder, and Tar.extract needs an empty or absent
        # destination
        rm(dump_path; force = true, recursive = true)
        stream = GZip.open(local_archive)
        try
            Tar.extract(stream, dump_path)
        finally
            # Release the file handle even when the extraction fails
            close(stream)
        end
    end
    return dump_path
end

"""
    _create_or_get_tables_path(local_path)

Return the path of the `tables` folder inside `local_path`, creating it first
when nothing exists at that location.
"""
function _create_or_get_tables_path(local_path)
    tables_path = joinpath(local_path, "tables")
    if !ispath(tables_path)
        mkpath(tables_path)
    end
    return tables_path
end
67 changes: 23 additions & 44 deletions src/NCBITaxonomy.jl
Original file line number Diff line number Diff line change
@@ -1,59 +1,38 @@
module NCBITaxonomy
using DataFrames
using Arrow
import Arrow
using StringDistances
using AbstractTrees

if !haskey(ENV, "NCBITAXONOMY_PATH")
@warn """
The environmental variable NCBITAXONOMY_PATH is not set, so the tables will
be stored in the package path. This is not ideal, and you really should set
the NCBITAXONOMY_PATH.

This can be done by adding `ENV["NCBITAXONOMY_PATH"]` in your Julia startup
file. The path will be created automatically if it does not exist.
"""
end
const taxpath = get(ENV, "NCBITAXONOMY_PATH", joinpath(homedir(), "NCBITaxonomy"))
ispath(taxpath) || mkpath(taxpath)
# Point to where the taxonomy is located
include("local_archive_path.jl")
tables_path = _create_or_get_tables_path(_local_archive_path())

function __init__()
name_date = mtime(joinpath(taxpath, "tables", "names.arrow"))
return time() - name_date >= 2.6e+6 && @warn(
"Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version."
)
name_date = mtime(joinpath(tables_path, "names.arrow"))
over_30_days = time() - name_date >= 2.6e+6
if over_30_days
@warn(
"Your local taxonomy version is over 30 days old, we recommend using `] build NCBITaxonomy` to get the most recent version."
)
end
return nothing
end

include("types.jl")
export NCBITaxon, NCBINameClass, IDNotFoundInBackbone
export NCBITaxon, NCBINameClass

include("exceptions.jl")
export NameHasNoDirectMatch, NameHasMultipleMatches

names_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "names.arrow")))
names_table.class = NCBINameClass.(names_table.class)
names_table.lowercase = lowercase.(names_table.name)

division_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "division.arrow")))
select!(division_table, Not(:comments))

nodes_table = DataFrame(Arrow.Table(joinpath(taxpath, "tables", "nodes.arrow")))
select!(nodes_table, Not(r"inherited_"))
select!(nodes_table, Not(r"_code_id"))
select!(nodes_table, Not(:genbank_hidden))
select!(nodes_table, Not(:hidden_subtree))
select!(nodes_table, Not(:comments))
select!(nodes_table, Not(:embl))

nodes_table = innerjoin(nodes_table, division_table; on = :division_id)
select!(nodes_table, Not(:division_id))

names_table = leftjoin(
names_table,
unique(select(nodes_table, [:tax_id, :rank, :parent_tax_id]));
on = :tax_id,
)
scinames_table = names_table[findall(names_table.class .== class_scientific_name), :]
export NameHasNoDirectMatch, NameHasMultipleMatches, IDNotFoundInBackbone

# We load the core file with all we need in it
include("read_taxonomy.jl")
taxonomy = read_taxonomy(tables_path)
scinames = filter(r -> r.class == NCBITaxonomy.class_scientific_name, taxonomy)
groupedscinames = groupby(scinames, :tax_id)
groupedtaxonomy = groupby(taxonomy, :tax_id)
divisions = groupby(taxonomy, :division_code)
ranks = groupby(taxonomy, :rank)

include("taxon.jl")
export taxon, @ncbi_str
Expand Down
16 changes: 8 additions & 8 deletions src/interfaces/abstracttrees.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
Returns the children of a taxon.
"""
function AbstractTrees.children(tax::NCBITaxon)
positions = findall(isequal(tax.id), NCBITaxonomy.scinames_table.parent_tax_id)
positions = findall(isequal(tax.id), NCBITaxonomy.scinames.parent_tax_id)
if ~isempty(positions)
list_of_children = Vector{NCBITaxon}(undef, length(positions))
for i in axes(positions, 1)
list_of_children[i] = NCBITaxon(
NCBITaxonomy.scinames_table.name[positions[i]],
NCBITaxonomy.scinames_table.tax_id[positions[i]],
NCBITaxonomy.scinames.name[positions[i]],
NCBITaxonomy.scinames.tax_id[positions[i]],
)
end
return list_of_children
Expand All @@ -25,15 +25,15 @@ end
Returns the taxon from which the argument taxon is descended.
"""
function AbstractTrees.parent(tax::NCBITaxon)
position = findfirst(isequal(tax.id), NCBITaxonomy.scinames_table.tax_id)
position = findfirst(isequal(tax.id), NCBITaxonomy.scinames.tax_id)
if ~isnothing(position)
parent_position = findfirst(
isequal(NCBITaxonomy.scinames_table.parent_tax_id[position]),
NCBITaxonomy.scinames_table.tax_id,
isequal(NCBITaxonomy.scinames.parent_tax_id[position]),
NCBITaxonomy.scinames.tax_id,
)
return NCBITaxon(
NCBITaxonomy.scinames_table.name[parent_position],
NCBITaxonomy.scinames_table.tax_id[parent_position],
NCBITaxonomy.scinames.name[parent_position],
NCBITaxonomy.scinames.tax_id[parent_position],
)
else
return nothing
Expand Down
3 changes: 1 addition & 2 deletions src/lineage/rank.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,5 @@
Returns the rank of a taxon.
"""
function rank(tax::NCBITaxon)
position = findfirst(isequal(tax.id), NCBITaxonomy.nodes_table.tax_id)
return NCBITaxonomy.nodes_table.rank[position]
return only(NCBITaxonomy.groupedscinames[(tax_id = tax.id,)].rank)
end
Loading