update to v1.4.0

Kalan-Lab · May 14, 2024 · 10e9057 · 10e9057
1 parent 6a3bbd1
commit 10e9057
Show file tree

Hide file tree

Showing 10 changed files with 856 additions and 364 deletions.
diff --git a/bin/fai b/bin/fai
@@ -110,7 +110,7 @@ def create_parser():
 	parser.add_argument('-kpq', '--key_protein_queries', help="Path to protein multi-FASTA file containing key query sequences\nwhich some proportin of are required to be present in a gene cluster\nat a specific E-value cutoff.", required=False, default=None)
 	parser.add_argument('-kpe', '--key_protein_evalue_cutoff', type=float, help='E-value cutoff for finding key query sequences in putative\ngene cluster homolog segments. [Default\nis 1e-20]. Disregarded if less strict than the\ngeneral --evalue cutoff.', required=False, default=1e-20)
 	parser.add_argument('-kpm', '--key_protein_min_prop', type=float, help='The minimum proportion of distinct ortholog groups matching\nkey proteins needed to report a homologous gene-cluster. [Default is 0.0].', required=False, default=0.0)
-	parser.add_argument('-sct', '--syntenic_correlation_threshold', type=float, help="The minimum syntenic correlation needed to at least one known\nGCF instance to report segment. [Default is 0.6]", required=False, default=0.6)
+	parser.add_argument('-sct', '--syntenic_correlation_threshold', type=float, help="The minimum syntenic correlation needed to at least one known\nGCF instance to report segment. [Default is 0.0]", required=False, default=0.0)
 	parser.add_argument('-gdm', '--gc_delineation_mode', help='Method/mode for delineation of gene-cluster boundaries. Options are\n"Gene-Clumper" or "HMM". Default is Gene-Clumper.', required=False, default="Gene-Clumper")
 	parser.add_argument('-f', '--flanking_context', type=int, help='Number of bases to append to the beginning/end of the gene-cluster\nsegments identified. [Default is 1000].', required=False, default=1000)
 	parser.add_argument('-mgd', '--max_genes_disconnect', type=int, help="Maximum number of genes between gene-cluster segments detected by HMM to\nmerge segments together. Alternatively the number of genes separating\nhits if Gene-Clumper mode is used. Allows for more inclusivity of novel\nauxiliary genes. [Default is 5].", required=False, default=5)
@@ -236,10 +236,10 @@ def faiMain():
 					   "Filter for paralogous/gene-content overlapping segments",
 					   "Use prodigal-gv instead of pyrodigal to perform gene calling for query region in reference genome",
 					   "General E-value cutoff for detection of protein homologs in genome",
-					   "Minimum proportion of hits for gene presence in genome", "FASTA file with key proteins to consider",
+					   "Minimum proportion of distinct query proteins needed", "FASTA file with key proteins to consider",
 					   "E-values for key proteins to be considered as high-supporting evidence for gene cluster presence",
-					   "Minimum proportion of key protein hits to be considered as high-supporting evidence for gene cluster presence",
-					   "Syntenic correlation to known instance threshold required for gene cluster presence",
+					   "Minimum proportion of distinct key protein queries needed",
+					   "Syntenic correlation to known instance threshold needed",
 					   "Maximum distance in between candidate gene-cluster segments to perform merging",
 					   "Base pair for flanking context to extract",
 					   "Emission probability of gene being in gene-cluster state with homologous hit to gene-cluster",

diff --git a/bin/prepTG b/bin/prepTG
@@ -45,7 +45,9 @@ import subprocess
 import multiprocessing
 import shutil
 import gzip
+import traceback
 
+valid_gtdb_releases = set(['R214', 'R220'])
 premade_db_set = set(['Acinetobacter', 'Bacillales', 'Corynebacterium', 'Enterobacter', 'Enterococcus', 'Escherichia', 
                       'Klebsiella', 'Lactobacillus', 'Listeria', 'Micromonospora', 'Mycobacterium', 'Pseudomonas', 
 					  'Salmonella', 'Staphylococcus', 'Streptococcus', 'Neisseria', 'Streptomyces', 'Cutibacterium'])
@@ -118,7 +120,8 @@ def create_parser():
 
 	parser.add_argument('-d', '--download_premade', help='Download and setup pre-made databases of representative genomes for specific taxon/genus.\nProvide name of the taxon, e.g. "Escherichia"', required=False, default=None)
 	parser.add_argument('-i', '--input_dir', help='Directory with target genomes (either featuring GenBanks or FASTAs).', required=False, default=None)
-	parser.add_argument('-g', '--gtdb_taxon', help='Name of a GTDB-R214 valid genus or species to incorporate genomes from.\nShould be surrounded by\nquotes (e.g. "Escherichia coli").', required=False, default=None)
+	parser.add_argument('-g', '--gtdb_taxon', help='Name of a GTDB valid genus or species to incorporate genomes from.\nShould be surrounded by\nquotes (e.g. "Escherichia coli").', required=False, default=None)
+	parser.add_argument('-gr', '--gtdb_release', help='GTDB release to use. [Current default is R220].', default='R220', required=False)
 	parser.add_argument('-o', '--output_dir', help='Output directory, which can then be provided as input for the\n"-tg" argument in fai.', required=True)
 	parser.add_argument('-l', '--locus_tag_length', type=int, help='Length of locus tags to set. Default is 3, allows for <~18k genomes.', required=False, default=3)
 	parser.add_argument('-r', '--rename_locus_tags', action='store_true', help='Whether to rename locus tags if provided for CDS features in GenBanks.', required=False, default=False)
@@ -156,6 +159,7 @@ def prepTG():
 	outdir = os.path.abspath(myargs.output_dir) + '/'
 	locus_tag_length = myargs.locus_tag_length
 	rename_locus_tags = myargs.rename_locus_tags
+	gtdb_release = myargs.gtdb_release
 	cpus = myargs.cpus
 	max_memory = myargs.max_memory
 	gene_calling_method = myargs.gene_calling_method
@@ -219,26 +223,32 @@ def prepTG():
 	logObject.info('Running prepTG version %s' % version)
 	sys.stdout.write('Running prepTG version %s\n' % version)
 
-
 	gtdb_genomes_dir = outdir + 'GTDB_Genomes/'
-	gtdb_genomes_listing_file = outdir + 'GTDB_Genomes_Listing.txt'
 	if gtdb_taxon != None:
 		# Step 0: Download GTDB listing file from lsaBGC git repo, parse GTDB information 
 		# file, get list of Genbank accessions, and perform dry-run with ncbi-genome-download 
 		# if requested.
-		sys.stdout.write("Using axel to download the GTDB R214 listing file from the lsaBGC git repo.\n")
-		gtdb_listing_file = outdir + "GTDB_R214_Information.txt.gz"	
-		wget_cmd = ['axel', 'https://github.com/Kalan-Lab/lsaBGC/raw/main/db/GTDB_R214_Information.txt.gz', '-o', gtdb_listing_file]
+		gtdb_release = gtdb_release.upper()
+		try:
+			assert(gtdb_release in valid_gtdb_releases)
+		except:
+			sys.stderr.write('GTDB release specified is not valid, options include: ' + ', '.join(valid_gtdb_releases))
+			sys.exit(1)
+
+		sys.stdout.write("Using axel to download the GTDB " + gtdb_release + " listing file.\n")
+		gtdb_listing_file = outdir + "GTDB_" + gtdb_release + "_Information.txt.gz"	
+		wget_cmd = ['axel', 'https://github.com/raufs/gtdb_gca_to_taxa_mappings/raw/main/GTDB_' + gtdb_release + '_Information.txt.gz', '-o', gtdb_listing_file]
+
 		try:
 			subprocess.call(' '.join(wget_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, executable='/bin/bash')
 			assert (os.path.isfile(gtdb_listing_file))
 		except:
-			sys.stderr.write('Had an issue running: %s\n' % ' '.join(cmd))
+			sys.stderr.write('Had an issue running: %s\n' % ' '.join(wget_cmd))
 			sys.stderr.write(traceback.format_exc())
 			sys.exit(1)
 
 		genbank_accession_listing_file = outdir + 'NCBI_Genbank_Accession_Listing.txt'
-		sys.stdout.write("--------------------\nStep 0\n--------------------\nBeginning by assessing which genomic assemblies are available for the taxa %s in GTDB and NCBI's Genbank db\n" % gtdb_taxon)
+		sys.stdout.write("--------------------\nStep 0\n--------------------\nBeginning by assessing which genomic assemblies are available for the taxa %s in GTDB %s and NCBI's Genbank db\n" % (gtdb_taxon, gtdb_release))
 
 		if not os.path.isfile(genbank_accession_listing_file):
 			genbank_accession_listing_handle = open(genbank_accession_listing_file, 'w')

diff --git a/bin/zol b/bin/zol
@@ -45,6 +45,7 @@ import shutil
 import subprocess
 import pickle
 from datetime import datetime
+import traceback
 
 def create_parser():
 	""" Parse arguments """
@@ -96,9 +97,12 @@ def create_parser():
 	parser.add_argument('-r', '--rename_lt', action='store_true', help="Rename locus-tags for CDS features in GenBanks.", required=False, default=False)
 	parser.add_argument('-d', '--dereplicate', action='store_true', help='Perform dereplication of input GenBanks using skani\nand single-linkage clustering or MCL.', required=False, default=False)
 	parser.add_argument('-ri', '--reinflate', action='store_true', help='Perform re-inflation with all gene-clusters of\northo-groups identified via dereplicated analysis.', required=False, default=False)
+	parser.add_argument('-cdo', '--cdhit_orthogroup', action='store_true', help='Cluster proteins using CD-HIT instead of using the\nstandard InParanoid-like ortholog group prediction approach.\nThis approach is faster and uses less memory, but is less accurate.', required=False, default=False)
+	parser.add_argument('-cdp', '--cdhit_params', help='Parameters for performing CD-HIT based ortholog group\nclustering if requested via --cdhit_orthogroup.\n[Default is "-c 0.5 -aL 0.25 -aS 0.5 -n 3 -M 4000"]', required=False, default="-c 0.5 -aL 0.25 -aS 0.5 -n 3 -M 4000")
 	parser.add_argument('-dt', '--derep_identity', type=float, help='skani ANI threshold to use for dereplication. [Default is 99.0].', required=False, default=99.0)
 	parser.add_argument('-dc', '--derep_coverage', type=float, help='skani aligned fraction threshold to use for\ndereplication. [Default is 95.0].', required=False, default=95.0)
 	parser.add_argument('-di', '--derep_inflation', type=float, help='Inflation parameter for MCL to use for dereplication of\ngene-clusters. If not specified single-linkage clustering\nwill be used instead.', required=False, default=None)
+	parser.add_argument('-rp', '--reinflate_params', help='Parameters for running CD-HIT for re-inflation, please\nsurround argument input with double quotes.\n[Default is "-c 0.98 -aL 0.95 -aS 0.95 -n 5 -M 4000"].', required=False, default="-c 0.98 -aL 0.95 -aS 0.95 -n 5 -M 4000")
 	parser.add_argument('-ibc', '--impute_broad_conservation', action='store_true', help='Impute weighted conservation stats based on cluster size associated\nwith dereplicated representatives.')
 	parser.add_argument('-ces', '--comprehensive_evo_stats', action='store_true', help='Allow computing of evolutionary statistics for non-single-copy genes.', required=False, default=False)
 	parser.add_argument('-aec', '--allow_edge_cds', action='store_true', help='Allow CDS within gene-cluster GenBanks with the attribute\n"near_scaffold_edge=True", which is set by fai for features\nwithin 2kb of contig edges.', required=False, default=False)
@@ -156,6 +160,7 @@ def zolMain():
 	ces_flag = myargs.comprehensive_evo_stats
 	dereplicate_flag = myargs.dereplicate
 	reinflate_flag = myargs.reinflate
+	reinflate_params = myargs.reinflate_params
 	derep_identity = myargs.derep_identity
 	derep_coverage = myargs.derep_coverage
 	derep_inflation = myargs.derep_inflation
@@ -165,6 +170,8 @@ def zolMain():
 	allow_edge_cds_flag = myargs.allow_edge_cds
 	refine_gene_calling_flag = myargs.refine_gene_calling
 	only_orthogroups_flag = myargs.only_orthogroups
+	cdhit_orthogroup_flag = myargs.cdhit_orthogroup
+	cdhit_orthogroup_params = myargs.cdhit_params
 	max_memory = myargs.max_memory
 
 	try:
@@ -206,12 +213,15 @@ def zolMain():
 
 	logObject.info("Saving parameters for future records.")
 	parameters_file = outdir + 'Parameter_Inputs.txt'
-	parameter_values = [input_dir, outdir, select_fai_params_mode, identity_threshold, coverage_threshold, evalue_threshold, use_super5, 
-					 	fubar_selection, skip_gard, focal_genbanks_listing_file, comparator_genbanks_listing_file, filter_lq_flag, 
-						filter_dq_flag, only_orthogroups_flag, ibc_flag, ces_flag, rename_lt_flag, allow_edge_cds_flag, dereplicate_flag, 
-						reinflate_flag, derep_identity, derep_coverage, derep_inflation, custom_database, refine_gene_calling_flag, 
-						length, width,full_genbank_labels, cpus, max_memory]
+	parameter_values = [input_dir, outdir, select_fai_params_mode, cdhit_orthogroup_flag, cdhit_orthogroup_params,
+					    identity_threshold, coverage_threshold, evalue_threshold, use_super5, fubar_selection, skip_gard, 
+						focal_genbanks_listing_file, comparator_genbanks_listing_file, filter_lq_flag, filter_dq_flag, 
+						only_orthogroups_flag, ibc_flag, ces_flag, rename_lt_flag, allow_edge_cds_flag, dereplicate_flag, 
+						reinflate_flag, reinflate_params, derep_identity, derep_coverage, derep_inflation, custom_database,
+						refine_gene_calling_flag, length, width,full_genbank_labels, cpus, max_memory]
 	parameter_names = ["Input directory with loci GenBanks", "Output directory", "Select fai parameters mode?", 
+					   "Perform iterative CD-HIT based orthogrouping instead of default InParanoid-based approach?",
+					   "CD-HIT parameters for orthogrouping (assuming approach requested)", 
 					   "Ortholog group finding identity threshold", "Ortholog group finding coverage threshold", 
 					   "Ortholog group finding E-value threshold", "Use super5 mode in MUSCLE alignments?", 
 					   "Run FUBAR selection analyses?", "Skip GARD partitioning by recombination breakpoints?", 
@@ -220,11 +230,11 @@ def zolMain():
 					   "Perform broad level estimation of ortholog group conservation if dereplication requested?",
 					   "Comprehensive reporting of evolutionary statistics, including for non-single copy ortholog groups",
 					   "Rename locus tags?", "Use CDS features with attribute near_scaffold_edge=True.",
-					   "Perform Dereplication?", "Perform reinflation?", "Dereplication identity threshold",
-					   "Dereplication coverage threshold", "Dereplication clustering method / MCL inflation",
-					   "Custom annotation database", "Refine gene calling using the custom annotation database",
-					   "Plot height", "Plot width", "Use full GenBank labels?", "Number of CPUs requested",
-					   "Maximum memory in GB"]
+					   "Perform Dereplication?", "Perform reinflation?", "Reinflation CD-HIT parameters", 
+					   "Dereplication identity threshold", "Dereplication coverage threshold", 
+					   "Dereplication clustering method / MCL inflation", "Custom annotation database", 
+					   "Refine gene calling using the custom annotation database", "Plot height", "Plot width", 
+					   "Use full GenBank labels?", "Number of CPUs requested", "Maximum memory in GB"]
 	util.logParametersToFile(parameters_file, parameter_names, parameter_values)
 	logObject.info("Done saving parameters!")
 
@@ -376,6 +386,9 @@ def zolMain():
 					sys.exit(1)
 			fo_cmd = ['findOrthologs.py', '-p', fo_prot_dir, '-o', og_dir, '-e', str(evalue_threshold), '-i',
 					  str(identity_threshold), '-q', str(coverage_threshold), '-c', str(cpus)]
+			if cdhit_orthogroup_flag:
+				fo_cmd = ['findOrthologs.py', '-p', fo_prot_dir, '-o', og_dir, '-cd', 
+			  			  '-cdp="' + cdhit_orthogroup_params + '"', '-c', str(cpus)]
 			try:
 				subprocess.call(' '.join(fo_cmd), shell=True, stdout=subprocess.DEVNULL,
 								stderr=subprocess.DEVNULL, executable='/bin/bash')
@@ -392,6 +405,7 @@ def zolMain():
 		except Exception as e:
 			sys.stderr.write('Issues with determining ortholog/ortholog groups!\nThis is likely because no core ortholog groups were identified, please consider filtering\nlow gene-cluster instances or adjusting clustering parameters!\n')
 			logObject.error('Issues with determining ortholog/ortholog groups!')
+			sys.stderr.write(traceback.format_exc() + '\n')
 			sys.stderr.write(str(e) + '\n')
 			sys.exit(1)
 		os.system('touch %s' % step2_check_file)