Skip to content

Commit

Permalink
update to v1.4.0
Browse files Browse the repository at this point in the history
  • Loading branch information
raufs committed May 14, 2024
1 parent 6a3bbd1 commit 10e9057
Show file tree
Hide file tree
Showing 10 changed files with 856 additions and 364 deletions.
8 changes: 4 additions & 4 deletions bin/fai
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def create_parser():
parser.add_argument('-kpq', '--key_protein_queries', help="Path to protein multi-FASTA file containing key query sequences\nwhich some proportin of are required to be present in a gene cluster\nat a specific E-value cutoff.", required=False, default=None)
parser.add_argument('-kpe', '--key_protein_evalue_cutoff', type=float, help='E-value cutoff for finding key query sequences in putative\ngene cluster homolog segments. [Default\nis 1e-20]. Disregarded if less strict than the\ngeneral --evalue cutoff.', required=False, default=1e-20)
parser.add_argument('-kpm', '--key_protein_min_prop', type=float, help='The minimum proportion of distinct ortholog groups matching\nkey proteins needed to report a homologous gene-cluster. [Default is 0.0].', required=False, default=0.0)
parser.add_argument('-sct', '--syntenic_correlation_threshold', type=float, help="The minimum syntenic correlation needed to at least one known\nGCF instance to report segment. [Default is 0.6]", required=False, default=0.6)
parser.add_argument('-sct', '--syntenic_correlation_threshold', type=float, help="The minimum syntenic correlation needed to at least one known\nGCF instance to report segment. [Default is 0.0]", required=False, default=0.0)
parser.add_argument('-gdm', '--gc_delineation_mode', help='Method/mode for delineation of gene-cluster boundaries. Options are\n"Gene-Clumper" or "HMM". Default is Gene-Clumper.', required=False, default="Gene-Clumper")
parser.add_argument('-f', '--flanking_context', type=int, help='Number of bases to append to the beginning/end of the gene-cluster\nsegments identified. [Default is 1000].', required=False, default=1000)
parser.add_argument('-mgd', '--max_genes_disconnect', type=int, help="Maximum number of genes between gene-cluster segments detected by HMM to\nmerge segments together. Alternatively the number of genes separating\nhits if Gene-Clumper mode is used. Allows for more inclusivity of novel\nauxiliary genes. [Default is 5].", required=False, default=5)
Expand Down Expand Up @@ -236,10 +236,10 @@ def faiMain():
"Filter for paralogous/gene-content overlapping segments",
"Use prodigal-gv instead of pyrodigal to perform gene calling for query region in reference genome",
"General E-value cutoff for detection of protein homologs in genome",
"Minimum proportion of hits for gene presence in genome", "FASTA file with key proteins to consider",
"Minimum proportion of distinct query proteins needed", "FASTA file with key proteins to consider",
"E-values for key proteins to be considered as high-supporting evidence for gene cluster presence",
"Minimum proportion of key protein hits to be considered as high-supporting evidence for gene cluster presence",
"Syntenic correlation to known instance threshold required for gene cluster presence",
"Minimum proportion of distinct key protein queries needed",
"Syntenic correlation to known instance threshold needed",
"Maximum distance in between candidate gene-cluster segments to perform merging",
"Base pair for flanking context to extract",
"Emission probability of gene being in gene-cluster state with homologous hit to gene-cluster",
Expand Down
26 changes: 18 additions & 8 deletions bin/prepTG
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ import subprocess
import multiprocessing
import shutil
import gzip
import traceback

valid_gtdb_releases = set(['R214', 'R220'])
premade_db_set = set(['Acinetobacter', 'Bacillales', 'Corynebacterium', 'Enterobacter', 'Enterococcus', 'Escherichia',
'Klebsiella', 'Lactobacillus', 'Listeria', 'Micromonospora', 'Mycobacterium', 'Pseudomonas',
'Salmonella', 'Staphylococcus', 'Streptococcus', 'Neisseria', 'Streptomyces', 'Cutibacterium'])
Expand Down Expand Up @@ -118,7 +120,8 @@ def create_parser():

parser.add_argument('-d', '--download_premade', help='Download and setup pre-made databases of representative genomes for specific taxon/genus.\nProvide name of the taxon, e.g. "Escherichia"', required=False, default=None)
parser.add_argument('-i', '--input_dir', help='Directory with target genomes (either featuring GenBanks or FASTAs).', required=False, default=None)
parser.add_argument('-g', '--gtdb_taxon', help='Name of a GTDB-R214 valid genus or species to incorporate genomes from.\nShould be surrounded by\nquotes (e.g. "Escherichia coli").', required=False, default=None)
parser.add_argument('-g', '--gtdb_taxon', help='Name of a GTDB valid genus or species to incorporate genomes from.\nShould be surrounded by\nquotes (e.g. "Escherichia coli").', required=False, default=None)
parser.add_argument('-gr', '--gtdb_release', help='GTDB release to use. [Current default is R220].', default='R220', required=False)
parser.add_argument('-o', '--output_dir', help='Output directory, which can then be provided as input for the\n"-tg" argument in fai.', required=True)
parser.add_argument('-l', '--locus_tag_length', type=int, help='Length of locus tags to set. Default is 3, allows for <~18k genomes.', required=False, default=3)
parser.add_argument('-r', '--rename_locus_tags', action='store_true', help='Whether to rename locus tags if provided for CDS features in GenBanks.', required=False, default=False)
Expand Down Expand Up @@ -156,6 +159,7 @@ def prepTG():
outdir = os.path.abspath(myargs.output_dir) + '/'
locus_tag_length = myargs.locus_tag_length
rename_locus_tags = myargs.rename_locus_tags
gtdb_release = myargs.gtdb_release
cpus = myargs.cpus
max_memory = myargs.max_memory
gene_calling_method = myargs.gene_calling_method
Expand Down Expand Up @@ -219,26 +223,32 @@ def prepTG():
logObject.info('Running prepTG version %s' % version)
sys.stdout.write('Running prepTG version %s\n' % version)


gtdb_genomes_dir = outdir + 'GTDB_Genomes/'
gtdb_genomes_listing_file = outdir + 'GTDB_Genomes_Listing.txt'
if gtdb_taxon != None:
# Step 0: Download GTDB listing file from lsaBGC git repo, parse GTDB information
# file, get list of Genbank accessions, and perform dry-run with ncbi-genome-download
# if requested.
sys.stdout.write("Using axel to download the GTDB R214 listing file from the lsaBGC git repo.\n")
gtdb_listing_file = outdir + "GTDB_R214_Information.txt.gz"
wget_cmd = ['axel', 'https://github.com/Kalan-Lab/lsaBGC/raw/main/db/GTDB_R214_Information.txt.gz', '-o', gtdb_listing_file]
gtdb_release = gtdb_release.upper()
try:
assert(gtdb_release in valid_gtdb_releases)
except:
sys.stderr.write('GTDB release specified is not valid, options include: ' + ', '.join(valid_gtdb_releases))
sys.exit(1)

sys.stdout.write("Using axel to download the GTDB " + gtdb_release + " listing file.\n")
gtdb_listing_file = outdir + "GTDB_" + gtdb_release + "_Information.txt.gz"
wget_cmd = ['axel', 'https://github.com/raufs/gtdb_gca_to_taxa_mappings/raw/main/GTDB_' + gtdb_release + '_Information.txt.gz', '-o', gtdb_listing_file]

try:
subprocess.call(' '.join(wget_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, executable='/bin/bash')
assert (os.path.isfile(gtdb_listing_file))
except:
sys.stderr.write('Had an issue running: %s\n' % ' '.join(cmd))
sys.stderr.write('Had an issue running: %s\n' % ' '.join(wget_cmd))
sys.stderr.write(traceback.format_exc())
sys.exit(1)

genbank_accession_listing_file = outdir + 'NCBI_Genbank_Accession_Listing.txt'
sys.stdout.write("--------------------\nStep 0\n--------------------\nBeginning by assessing which genomic assemblies are available for the taxa %s in GTDB and NCBI's Genbank db\n" % gtdb_taxon)
sys.stdout.write("--------------------\nStep 0\n--------------------\nBeginning by assessing which genomic assemblies are available for the taxa %s in GTDB %s and NCBI's Genbank db\n" % (gtdb_taxon, gtdb_release))

if not os.path.isfile(genbank_accession_listing_file):
genbank_accession_listing_handle = open(genbank_accession_listing_file, 'w')
Expand Down
34 changes: 24 additions & 10 deletions bin/zol
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import shutil
import subprocess
import pickle
from datetime import datetime
import traceback

def create_parser():
""" Parse arguments """
Expand Down Expand Up @@ -96,9 +97,12 @@ def create_parser():
parser.add_argument('-r', '--rename_lt', action='store_true', help="Rename locus-tags for CDS features in GenBanks.", required=False, default=False)
parser.add_argument('-d', '--dereplicate', action='store_true', help='Perform dereplication of input GenBanks using skani\nand single-linkage clustering or MCL.', required=False, default=False)
parser.add_argument('-ri', '--reinflate', action='store_true', help='Perform re-inflation with all gene-clusters of\northo-groups identified via dereplicated analysis.', required=False, default=False)
parser.add_argument('-cdo', '--cdhit_orthogroup', action='store_true', help='Cluster proteins using CD-HIT instead of using the\nstandard InParanoid-like ortholog group prediction approach.\nThis approach is faster and uses less memory, but is less accurate.', required=False, default=False)
parser.add_argument('-cdp', '--cdhit_params', help='Parameters for performing CD-HIT based ortholog group\nclustering if requested via --cdhit_orthogroup.\n[Default is "-c 0.5 -aL 0.25 -aS 0.5 -n 3 -M 4000"]', required=False, default="-c 0.5 -aL 0.25 -aS 0.5 -n 3 -M 4000")
parser.add_argument('-dt', '--derep_identity', type=float, help='skani ANI threshold to use for dereplication. [Default is 99.0].', required=False, default=99.0)
parser.add_argument('-dc', '--derep_coverage', type=float, help='skani aligned fraction threshold to use for\ndereplication. [Default is 95.0].', required=False, default=95.0)
parser.add_argument('-di', '--derep_inflation', type=float, help='Inflation parameter for MCL to use for dereplication of\ngene-clusters. If not specified single-linkage clustering\nwill be used instead.', required=False, default=None)
parser.add_argument('-rp', '--reinflate_params', help='Parameters for running CD-HIT for re-inflation, please\nsurround argument input with double quotes.\n[Default is "-c 0.98 -aL 0.95 -aS 0.95 -n 5 -M 4000"].', required=False, default="-c 0.98 -aL 0.95 -aS 0.95 -n 5 -M 4000")
parser.add_argument('-ibc', '--impute_broad_conservation', action='store_true', help='Impute weighted conservation stats based on cluster size associated\nwith dereplicated representatives.')
parser.add_argument('-ces', '--comprehensive_evo_stats', action='store_true', help='Allow computing of evolutionary statistics for non-single-copy genes.', required=False, default=False)
parser.add_argument('-aec', '--allow_edge_cds', action='store_true', help='Allow CDS within gene-cluster GenBanks with the attribute\n"near_scaffold_edge=True", which is set by fai for features\nwithin 2kb of contig edges.', required=False, default=False)
Expand Down Expand Up @@ -156,6 +160,7 @@ def zolMain():
ces_flag = myargs.comprehensive_evo_stats
dereplicate_flag = myargs.dereplicate
reinflate_flag = myargs.reinflate
reinflate_params = myargs.reinflate_params
derep_identity = myargs.derep_identity
derep_coverage = myargs.derep_coverage
derep_inflation = myargs.derep_inflation
Expand All @@ -165,6 +170,8 @@ def zolMain():
allow_edge_cds_flag = myargs.allow_edge_cds
refine_gene_calling_flag = myargs.refine_gene_calling
only_orthogroups_flag = myargs.only_orthogroups
cdhit_orthogroup_flag = myargs.cdhit_orthogroup
cdhit_orthogroup_params = myargs.cdhit_params
max_memory = myargs.max_memory

try:
Expand Down Expand Up @@ -206,12 +213,15 @@ def zolMain():

logObject.info("Saving parameters for future records.")
parameters_file = outdir + 'Parameter_Inputs.txt'
parameter_values = [input_dir, outdir, select_fai_params_mode, identity_threshold, coverage_threshold, evalue_threshold, use_super5,
fubar_selection, skip_gard, focal_genbanks_listing_file, comparator_genbanks_listing_file, filter_lq_flag,
filter_dq_flag, only_orthogroups_flag, ibc_flag, ces_flag, rename_lt_flag, allow_edge_cds_flag, dereplicate_flag,
reinflate_flag, derep_identity, derep_coverage, derep_inflation, custom_database, refine_gene_calling_flag,
length, width,full_genbank_labels, cpus, max_memory]
parameter_values = [input_dir, outdir, select_fai_params_mode, cdhit_orthogroup_flag, cdhit_orthogroup_params,
identity_threshold, coverage_threshold, evalue_threshold, use_super5, fubar_selection, skip_gard,
focal_genbanks_listing_file, comparator_genbanks_listing_file, filter_lq_flag, filter_dq_flag,
only_orthogroups_flag, ibc_flag, ces_flag, rename_lt_flag, allow_edge_cds_flag, dereplicate_flag,
reinflate_flag, reinflate_params, derep_identity, derep_coverage, derep_inflation, custom_database,
refine_gene_calling_flag, length, width,full_genbank_labels, cpus, max_memory]
parameter_names = ["Input directory with loci GenBanks", "Output directory", "Select fai parameters mode?",
"Perform iterative CD-HIT based orthogrouping instead of default InParanoid-based approach?",
"CD-HIT parameters for orthogrouping (assuming approach requested)",
"Ortholog group finding identity threshold", "Ortholog group finding coverage threshold",
"Ortholog group finding E-value threshold", "Use super5 mode in MUSCLE alignments?",
"Run FUBAR selection analyses?", "Skip GARD partitioning by recombination breakpoints?",
Expand All @@ -220,11 +230,11 @@ def zolMain():
"Perform broad level estimation of ortholog group conservation if dereplication requested?",
"Comprehensive reporting of evolutionary statistics, including for non-single copy ortholog groups",
"Rename locus tags?", "Use CDS features with attribute near_scaffold_edge=True.",
"Perform Dereplication?", "Perform reinflation?", "Dereplication identity threshold",
"Dereplication coverage threshold", "Dereplication clustering method / MCL inflation",
"Custom annotation database", "Refine gene calling using the custom annotation database",
"Plot height", "Plot width", "Use full GenBank labels?", "Number of CPUs requested",
"Maximum memory in GB"]
"Perform Dereplication?", "Perform reinflation?", "Reinflation CD-HIT parameters",
"Dereplication identity threshold", "Dereplication coverage threshold",
"Dereplication clustering method / MCL inflation", "Custom annotation database",
"Refine gene calling using the custom annotation database", "Plot height", "Plot width",
"Use full GenBank labels?", "Number of CPUs requested", "Maximum memory in GB"]
util.logParametersToFile(parameters_file, parameter_names, parameter_values)
logObject.info("Done saving parameters!")

Expand Down Expand Up @@ -376,6 +386,9 @@ def zolMain():
sys.exit(1)
fo_cmd = ['findOrthologs.py', '-p', fo_prot_dir, '-o', og_dir, '-e', str(evalue_threshold), '-i',
str(identity_threshold), '-q', str(coverage_threshold), '-c', str(cpus)]
if cdhit_orthogroup_flag:
fo_cmd = ['findOrthologs.py', '-p', fo_prot_dir, '-o', og_dir, '-cd',
'-cdp="' + cdhit_orthogroup_params + '"', '-c', str(cpus)]
try:
subprocess.call(' '.join(fo_cmd), shell=True, stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL, executable='/bin/bash')
Expand All @@ -392,6 +405,7 @@ def zolMain():
except Exception as e:
sys.stderr.write('Issues with determining ortholog/ortholog groups!\nThis is likely because no core ortholog groups were identified, please consider filtering\nlow gene-cluster instances or adjusting clustering parameters!\n')
logObject.error('Issues with determining ortholog/ortholog groups!')
sys.stderr.write(traceback.format_exc() + '\n')
sys.stderr.write(str(e) + '\n')
sys.exit(1)
os.system('touch %s' % step2_check_file)
Expand Down
Loading

0 comments on commit 10e9057

Please sign in to comment.