commit v1.3.19

Kalan-Lab · Mar 17, 2024 · 4515225 · 4515225
1 parent e28079e
commit 4515225
Show file tree

Hide file tree

Showing 10 changed files with 480 additions and 76 deletions.
diff --git a/README.md b/README.md
@@ -49,7 +49,7 @@ In addition, please cite important [dependency software or databases](https://gi
 
 ### Zoom on Locus (zol)
 
-**`zol`** is a program to create table reports showing ortholog group conservation, annotation, and evolutionary stats for any gene-cluster or locus of interest. At it's core it performs ortholog group inference de novo across gene-cluster instances similar to [CORASON](https://github.com/nselem/corason), but uses an InParanoid-like algorithm. Tables are similar but currently more in-depth and feature some different statistics than lsaBGC-PopGene reports. zol produces a basic heatmap, but for visualizations of gene-clusters we recommend other tools such as [clinker](https://github.com/gamcil/clinker), [CORASON](https://github.com/nselem/corason), and [gggenomes](https://github.com/thackl/gggenomes), which we think the in-depth spreadsheet complements nicely. We also provide examples of how zol and skani can be used to select representative gene clusters for such visual investigations. 
+**`zol`** is a program to create table reports showing ortholog group conservation, annotation, and evolutionary stats for any gene-cluster or locus of interest. At it's core it performs ortholog group inference de novo across gene-cluster instances similar to [CORASON](https://github.com/nselem/corason), but uses an InParanoid-like algorithm. Tables are similar but currently more in-depth and feature some different statistics than lsaBGC-PopGene reports. zol produces a basic heatmap, but for visualizations of gene-clusters we recommend other tools such as [clinker](https://github.com/gamcil/clinker), [pyGenomeViz](https://github.com/moshi4/pyGenomeViz), [CORASON](https://github.com/nselem/corason), and [gggenomes](https://github.com/thackl/gggenomes), which we think the in-depth spreadsheet complements nicely. We also provide examples of how zol and skani can be used to select representative gene clusters for such visual investigations. 
 
 Critically, ***with the development of some key options, together, fai and zol enable high-throughput detection of orthologs across multi-species datasets comprising of thousands of genomes.***
 

diff --git a/bin/prepTG b/bin/prepTG
@@ -162,6 +162,9 @@ def prepTG():
 	meta_mode = myargs.meta_mode
 	create_species_tree = myargs.create_species_tree
 	reference_proteome = myargs.reference_proteome
+	if reference_proteome != None:
+		reference_proteome = os.path.abspath(myargs.reference_proteome)
+
 
 	if os.path.isdir(outdir):
 		sys.stderr.write("Output directory exists! Exiting\n ")
@@ -321,8 +324,8 @@ def prepTG():
 			assert(os.path.isdir(target_genomes_dir))
 			target_genomes_dir = os.path.abspath(target_genomes_dir) + '/'
 		except:
- 			sys.stderr.write('Issue with validating directory with target genomes exists.\n')
- 			sys.exit(1)
+			sys.stderr.write('Issue with validating directory with target genomes exists.\n')
+			sys.exit(1)
 
 
 	# set max memory limit
@@ -503,4 +506,4 @@ def prepTG():
 	sys.exit(0)
 
 if __name__ == '__main__':
-	prepTG()
+	prepTG()
diff --git a/bin/zol b/bin/zol
@@ -87,6 +87,7 @@ def create_parser():
 
 	parser.add_argument('-i', '--input_dir', help='Directory with orthologous/homologous locus-specific GenBanks.\nFiles must end with ".gbk", ".gbff", or ".genbank".', required=False, default=None)
 	parser.add_argument('-o', '--output_dir', help='Parent output/workspace directory.', required=True)
+	parser.add_argument('-sfp', '--select_fai_params_mode', action='store_true', help='Determine statistics informative for selecting parameters for running\nfai to find more instances of the gene cluster.', required=False, default=False)
 	parser.add_argument('-it', '--identity_threshold', type=float, help='Minimum identity coverage for an alignment between protein\npairs from two gene-clusters to consider in search for\northologs. [Default is 30].', required=False, default=30.0)
 	parser.add_argument('-ct', '--coverage_threshold', type=float, help='Minimum query coverage for an alignment between protein\npairs from two gene-clusters to consider in search for\northologs. [Default is 50].', required=False, default=50.0)
 	parser.add_argument('-et', '--evalue_threshold', type=float, help='Maximum E-value for an alignment between protein pairs from\ntwo gene-clusters to consider in search for orthologs.\n[Default is 0.001].', required=False, default=0.001)
@@ -111,6 +112,7 @@ def create_parser():
 	parser.add_argument('-fgl', '--full_genbank_labels', action='store_true', help='Use full GenBank labels instead of just the first 20 characters.', required=False, default=False)
 	parser.add_argument('-f', '--focal_genbanks', help='File with focal genbank(s) listed (one per line).', required=False, default=None)
 	parser.add_argument('-fc', '--comparator_genbanks', help='Optional file with comparator genbank(s) listed.\nDefault is to use remaining GenBanks as comparators to focal listing.', required=False, default=None)
+	parser.add_argument('-oo', '--only_orthogroups', action='store_true', help="Only compute ortholog groups and stop (runs up to step 2).", required=False, default=False)
 	parser.add_argument('-c', '--cpus', type=int, help="Number of cpus/threads to use.", required=False, default=1)
 	parser.add_argument('-mm', '--max_memory', type=int, help='Uses resource module to set soft memory limit. Provide in Giga-bytes.\nGenerally memory shouldn\'t be a major concern unless working\nwith hundreds of large metagenomes. [currently\nexperimental; default is None].', default=None, required=False)
 	parser.add_argument('-v', '--version', action='store_true', help="Get version and exit.", required=False, default=False)
@@ -136,6 +138,7 @@ def zolMain():
 
 	input_dir = os.path.abspath(myargs.input_dir) + '/'
 	outdir = os.path.abspath(myargs.output_dir) + '/'
+	select_fai_params_mode = myargs.select_fai_params_mode
 	cpus = myargs.cpus
 	use_super5 = myargs.use_super5
 	fubar_selection = myargs.selection_analysis
@@ -161,6 +164,7 @@ def zolMain():
 	custom_database = myargs.custom_database
 	allow_edge_cds_flag = myargs.allow_edge_cds
 	refine_gene_calling_flag = myargs.refine_gene_calling
+	only_orthogroups_flag = myargs.only_orthogroups
 	max_memory = myargs.max_memory
 
 	try:
@@ -182,6 +186,9 @@ def zolMain():
 	if not os.path.isdir(check_dir):
 		util.setupReadyDirectory([check_dir])
 
+	if select_fai_params_mode:
+		allow_edge_cds_flag = True
+
 	"""
 	START WORKFLOW
 	"""
@@ -199,16 +206,17 @@ def zolMain():
 
 	logObject.info("Saving parameters for future records.")
 	parameters_file = outdir + 'Parameter_Inputs.txt'
-	parameter_values = [input_dir, outdir,  use_super5, fubar_selection, skip_gard,
-					    focal_genbanks_listing_file, comparator_genbanks_listing_file, filter_lq_flag, filter_dq_flag,
-						ibc_flag, ces_flag, rename_lt_flag, allow_edge_cds_flag, dereplicate_flag, reinflate_flag, derep_identity,
-						derep_coverage, derep_inflation, custom_database, refine_gene_calling_flag, length, width,
-						full_genbank_labels, cpus, max_memory]
-	parameter_names = ["Input directory with loci GenBanks", "Output directory",
-					   "Use super5 mode in MUSCLE alignments?", "Run FUBAR selection analyses?",
-					   "Skip GARD partitioning by recombination breakpoints?", "Focal GenBanks listing",
-					   "Comparator GenBanks listing", "Filter low quality gene clusters?",
-					   "Filter draft/incomplete gene clusters?",
+	parameter_values = [input_dir, outdir, select_fai_params_mode, identity_threshold, coverage_threshold, evalue_threshold, use_super5, 
+					 	fubar_selection, skip_gard, focal_genbanks_listing_file, comparator_genbanks_listing_file, filter_lq_flag, 
+						filter_dq_flag, only_orthogroups_flag, ibc_flag, ces_flag, rename_lt_flag, allow_edge_cds_flag, dereplicate_flag, 
+						reinflate_flag, derep_identity, derep_coverage, derep_inflation, custom_database, refine_gene_calling_flag, 
+						length, width,full_genbank_labels, cpus, max_memory]
+	parameter_names = ["Input directory with loci GenBanks", "Output directory", "Select fai parameters mode?", 
+					   "Ortholog group finding identity threshold", "Ortholog group finding coverage threshold", 
+					   "Ortholog group finding E-value threshold", "Use super5 mode in MUSCLE alignments?", 
+					   "Run FUBAR selection analyses?", "Skip GARD partitioning by recombination breakpoints?", 
+					   "Focal GenBanks listing", "Comparator GenBanks listing", "Filter low quality gene clusters?",
+					   "Filter draft/incomplete gene clusters?", "Only compute orthologs and stop?",
 					   "Perform broad level estimation of ortholog group conservation if dereplication requested?",
 					   "Comprehensive reporting of evolutionary statistics, including for non-single copy ortholog groups",
 					   "Rename locus tags?", "Use CDS features with attribute near_scaffold_edge=True.",
@@ -396,7 +404,24 @@ def zolMain():
 		genbanks = drep_genbanks
 		assert(os.path.isfile(ortho_matrix_file))
 
-	# Step 3: Create Alignments, Phylogenies and Consensus Sequences
+	# If fai param selection mode, do that and exit:
+	if select_fai_params_mode:
+		logObject.info('--------------------\nAssessing parameters to recommend for running fai to detect additional instances.')
+		sys.stdout.write('--------------------\nAssessing parameters to recommend for running fai to detect additional instances.\n')
+		proc_dir = outdir + 'Ortholog_Group_Processing/'
+		hg_prot_dir = proc_dir + 'OG_Protein_Sequences/'
+		hg_nucl_dir = proc_dir + 'OG_Nucleotide_Sequences/'
+		util.setupReadyDirectory([proc_dir, hg_prot_dir, hg_nucl_dir])
+		zol.partitionSequencesByHomologGroups(ortho_matrix_file, prot_dir, nucl_dir, hg_prot_dir, hg_nucl_dir, logObject)
+		util.determineFaiParamRecommendataions(genbanks, ortho_matrix_file, hg_prot_dir, outdir, logObject, cpus=cpus)
+		sys.exit(0)
+
+	if only_orthogroups_flag:
+		logObject.info('--------------------\nRequested mode was to only compute ortholog groups across gene clusters\n.The ortholog-group by gene-cluster matrix can be found at: %s\n' % ortho_matrix_file)
+		sys.stdout.write('--------------------\nRequested mode was to only compute ortholog groups across gene clusters.\nThe ortholog-group by gene-cluster matrix can be found at: %s\n' % ortho_matrix_file)
+		sys.exit(0)
+
+	# Step 3 (normal): Create Alignments, Phylogenies and Consensus Sequences
 	logObject.info('--------------------\nStep 3\n--------------------\nCreating alignments, trees and consensus sequences for ortholog groups')
 	sys.stdout.write('--------------------\nStep 3\n--------------------\nCreating alignments, trees and consensus sequences for ortholog groups\n')
 	step3_check_file = check_dir + 'step3.txt'

diff --git a/scripts/convertMiniprotGffToGbkAndProt.py b/scripts/convertMiniprotGffToGbkAndProt.py
@@ -108,6 +108,7 @@ def convertMiniProtGFFtoGenbank():
 	# Step 1: Parse GFF3 for transcript coordinates
 	query_mrna_coords = defaultdict(list)
 	scaffold_queries = defaultdict(list)
+	map_counter = 0
 	with open(miniprot_gff3) as omg:
 		for line in omg:
 			line = line.strip()
@@ -125,8 +126,9 @@ def convertMiniProtGFFtoGenbank():
 			if ls[6] == '-':
 				dire = -1
 			# in case there are paralogs
-			query_mrna_coords[tuple([query, start_coord])] = [scaffold, start_coord, end_coord, dire, score]
-			scaffold_queries[scaffold].append([query, start_coord])
+			query_mrna_coords[tuple([query, map_counter])] = [scaffold, start_coord, end_coord, dire, score]
+			scaffold_queries[scaffold].append([query, start_coord, map_counter])
+			map_counter += 1
 
 	# Step 2: Resolve overlap between mRNA transcripts
 	redundant = set([])
@@ -147,6 +149,7 @@ def convertMiniProtGFFtoGenbank():
 
 	# Step 3: Parse GFF3 for CDS coordinates
 	query_cds_coords = defaultdict(list)
+	query_cds_coords_accounted = defaultdict(set)
 	with open(miniprot_gff3) as omg:
 		for line in omg:
 			line = line.strip()
@@ -160,7 +163,29 @@ def convertMiniProtGFFtoGenbank():
 			dire = 1
 			if ls[6] == '-':
 				dire = -1
-			query_cds_coords[query].append([scaffold, start_coord, end_coord, dire])
+			score = float(ls[5])
+			curr_coord = tuple([scaffold, start_coord, end_coord, dire, score])	
+			if not curr_coord in query_cds_coords_accounted[query]:
+				query_cds_coords[query].append([scaffold, start_coord, end_coord, dire, score])
+				query_cds_coords_accounted[query].add(curr_coord)
+
+	# handle overlapping exons
+	cds_redundant = defaultdict(set)
+	for query in sorted(query_cds_coords):
+		for i, cdsc1 in enumerate(sorted(query_cds_coords[query], key=itemgetter(1))):
+			c1_coords = set(range(cdsc1[1], cdsc1[2]))
+			c1_score = cdsc1[4]
+			for j, cdsc2 in enumerate(sorted(query_cds_coords[query], key=itemgetter(1))):
+				if i >= j: continue
+				# check scaffolds are the same
+				if cdsc1[0] != cdsc2[0]: continue
+				c2_coords = set(range(cdsc2[1], cdsc2[2]))
+				if len(c1_coords.intersection(c2_coords)) > 0:
+					c2_score = cdsc2[4]
+					if c1_score >= c2_score:
+						cds_redundant[query].add(tuple(cdsc2))
+					else:
+						cds_redundant[query].add(tuple(cdsc1))
 
 	# Step 4: Go through FASTA scaffold/contig by scaffold/contig and create output GenBank
 	gbk_handle = open(output_genbank, 'w')
@@ -174,15 +199,17 @@ def convertMiniProtGFFtoGenbank():
 			gbk_rec.annotations['molecule_type'] = 'DNA'
 			feature_list = []
 			for mrna in sorted(scaffold_queries[rec.id], key=itemgetter(1)):
-				qid, start = mrna
-				key = tuple([qid, start])
-				if key in redundant: continue
-				mrna_info = query_mrna_coords[key]
+				qid, start, map_counter = mrna
+				key_for_redundancy_check = tuple([qid, map_counter])
+				key_for_paf = tuple([qid, start])
+				if key_for_redundancy_check in redundant: continue
+				mrna_info = query_mrna_coords[key_for_redundancy_check]
 				mrna_coords = set(range(mrna_info[1], mrna_info[2]))
 
 				mrna_exon_locs = []
 				all_coords = []
 				for cds_info in query_cds_coords[qid]:
+					if tuple(cds_info) in cds_redundant[qid]: continue
 					cds_coords = set(range(cds_info[1], cds_info[2]))
 					if len(mrna_coords.intersection(cds_coords)) > 0 and cds_info[0] == mrna_info[0]:
 						all_coords.append([cds_info[1], cds_info[2]])
@@ -220,7 +247,7 @@ def convertMiniProtGFFtoGenbank():
 				assert(prot_seq != None)
 				"""
 
-				scaffold, paf_start_coord, paf_end_coord, paf_cigar_parsed, paf_string = query_mrna_paf_info[key]
+				scaffold, paf_start_coord, paf_end_coord, paf_cigar_parsed, paf_string = query_mrna_paf_info[key_for_paf]
 
 				paf_nucl_seq = ''
 				paf_coord = paf_start_coord
@@ -264,6 +291,7 @@ def convertMiniProtGFFtoGenbank():
 				"""
 
 				faa_handle.write('>' + locus_tag + '_' + lt + '\n' + paf_prot_seq + '\n')
+				feature.qualifiers['protein_from_reference'] = qid
 				feature.qualifiers['locus_tag'] = locus_tag + '_' + lt
 				feature.qualifiers['translation'] = paf_prot_seq
 				feature.qualifiers['open_reading_frame'] = paf_nucl_seq

diff --git a/scripts/extractProteinsFromGenBank.py b/scripts/extractProteinsFromGenBank.py
@@ -0,0 +1,42 @@
+import os
+import sys
+from Bio import SeqIO
+import argparse
+
+def create_parser():
+	""" Parse arguments """
+	parser = argparse.ArgumentParser(description="""
+	Program: extractProteinsFromGenBank.py
+	Author: Rauf Salamzade
+	Affiliation: Kalan Lab, UW Madison, Department of Medical Microbiology and Immunology
+    
+    A simple script 
+	""", formatter_class=argparse.RawTextHelpFormatter)
+
+	parser.add_argument('-i', '--gene_cluster_genbank', help='Path to gene cluster genbank.', required=True)
+	args = parser.parse_args()
+	return args
+
+myargs = create_parser()
+gbk_file = myargs.gene_cluster_genbank
+
+counter = 0
+with open(gbk_file) as ogbf:
+    for rec in SeqIO.parse(ogbf, 'genbank'):
+        for feat in rec.features:
+            if not feat.type == 'CDS': continue
+            protein_id = None
+            try:
+                protein_id = feat.qualifiers.get('locus_tag')[0]
+            except:
+                pass
+
+            if protein_id == None:
+                try:
+                    protein_id = feat.qualifiers.get('protein_id')[0]
+                except:
+                    pass
+            if protein_id == None:
+                protein_id = 'CDS_' + str(counter)
+                counter += 1
+            print('>' + protein_id + '\n' + str(feat.qualifiers.get('translation')[0]))
diff --git a/scripts/generateSyntenicVisual.py b/scripts/generateSyntenicVisual.py
@@ -92,9 +92,9 @@ def genSynVis():
 	pif_handle = open(plot_input_file, 'w')
 	pif_handle.write('\t'.join(['OG', 'Start', 'End', 'Direction', 'SC', 'Metric']) + '\n')
 	for index, row in df.iterrows():
-		og = row['ortholog group (OG) ID']
+		og = row['Ortholog Group (OG) ID']
 		og_cons = row['Proportion of Total Gene Clusters with OG']
-		if float(og_cons) < 0.25: continue
+		#if float(og_cons) < 0.25: continue
 		og_mlen = float(row['OG Median Length (bp)'])
 		og_dir = row['OG Consensus Direction']
 		sc_flag = row['OG is Single Copy?']

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 import os
 
 setup(name='zol',
-      version='1.3.18',
+      version='1.3.19',
       description='',
       url='http://github.com/Kalan-Lab/zol/',
       author='Rauf Salamzade',

diff --git a/zol/orthologs/findOrthologs.py b/zol/orthologs/findOrthologs.py
@@ -109,7 +109,7 @@ def refactorProteomes(inputs):
 def oneVsAllParse(inputs):
 	sample, samp_algn_res, samp_forth_res, identity_cutoff, coverage_cutoff, logObject = inputs
 
-	rbh_cmd = [rbh_prog, samp_algn_res, str(identity_cutoff), str(coverage_cutoff), '>', samp_forth_res]
+	rbh_cmd = [rbh_prog, samp_algn_res, str(identity_cutoff), str(coverage_cutoff), sample, '>', samp_forth_res]
 	try:
 		subprocess.call(' '.join(rbh_cmd), shell=True, stdout=subprocess.DEVNULL,
 						stderr=subprocess.DEVNULL, executable='/bin/bash')