Skip to content

Commit

Permalink
commit v1.3.19
Browse files Browse the repository at this point in the history
  • Loading branch information
raufs committed Mar 17, 2024
1 parent e28079e commit 4515225
Show file tree
Hide file tree
Showing 10 changed files with 480 additions and 76 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ In addition, please cite important [dependency software or databases](https://gi

### Zoom on Locus (zol)

**`zol`** is a program to create table reports showing ortholog group conservation, annotation, and evolutionary stats for any gene-cluster or locus of interest. At it's core it performs ortholog group inference de novo across gene-cluster instances similar to [CORASON](https://github.com/nselem/corason), but uses an InParanoid-like algorithm. Tables are similar but currently more in-depth and feature some different statistics than lsaBGC-PopGene reports. zol produces a basic heatmap, but for visualizations of gene-clusters we recommend other tools such as [clinker](https://github.com/gamcil/clinker), [CORASON](https://github.com/nselem/corason), and [gggenomes](https://github.com/thackl/gggenomes), which we think the in-depth spreadsheet complements nicely. We also provide examples of how zol and skani can be used to select representative gene clusters for such visual investigations.
**`zol`** is a program to create table reports showing ortholog group conservation, annotation, and evolutionary stats for any gene-cluster or locus of interest. At it's core it performs ortholog group inference de novo across gene-cluster instances similar to [CORASON](https://github.com/nselem/corason), but uses an InParanoid-like algorithm. Tables are similar but currently more in-depth and feature some different statistics than lsaBGC-PopGene reports. zol produces a basic heatmap, but for visualizations of gene-clusters we recommend other tools such as [clinker](https://github.com/gamcil/clinker), [pyGenomeViz](https://github.com/moshi4/pyGenomeViz), [CORASON](https://github.com/nselem/corason), and [gggenomes](https://github.com/thackl/gggenomes), which we think the in-depth spreadsheet complements nicely. We also provide examples of how zol and skani can be used to select representative gene clusters for such visual investigations.

Critically, ***with the development of some key options, together, fai and zol enable high-throughput detection of orthologs across multi-species datasets comprising of thousands of genomes.***

Expand Down
9 changes: 6 additions & 3 deletions bin/prepTG
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ def prepTG():
meta_mode = myargs.meta_mode
create_species_tree = myargs.create_species_tree
reference_proteome = myargs.reference_proteome
if reference_proteome != None:
reference_proteome = os.path.abspath(myargs.reference_proteome)


if os.path.isdir(outdir):
sys.stderr.write("Output directory exists! Exiting\n ")
Expand Down Expand Up @@ -321,8 +324,8 @@ def prepTG():
assert(os.path.isdir(target_genomes_dir))
target_genomes_dir = os.path.abspath(target_genomes_dir) + '/'
except:
sys.stderr.write('Issue with validating directory with target genomes exists.\n')
sys.exit(1)
sys.stderr.write('Issue with validating directory with target genomes exists.\n')
sys.exit(1)


# set max memory limit
Expand Down Expand Up @@ -503,4 +506,4 @@ def prepTG():
sys.exit(0)

if __name__ == '__main__':
prepTG()
prepTG()
47 changes: 36 additions & 11 deletions bin/zol
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def create_parser():

parser.add_argument('-i', '--input_dir', help='Directory with orthologous/homologous locus-specific GenBanks.\nFiles must end with ".gbk", ".gbff", or ".genbank".', required=False, default=None)
parser.add_argument('-o', '--output_dir', help='Parent output/workspace directory.', required=True)
parser.add_argument('-sfp', '--select_fai_params_mode', action='store_true', help='Determine statistics informative for selecting parameters for running\nfai to find more instances of the gene cluster.', required=False, default=False)
parser.add_argument('-it', '--identity_threshold', type=float, help='Minimum identity coverage for an alignment between protein\npairs from two gene-clusters to consider in search for\northologs. [Default is 30].', required=False, default=30.0)
parser.add_argument('-ct', '--coverage_threshold', type=float, help='Minimum query coverage for an alignment between protein\npairs from two gene-clusters to consider in search for\northologs. [Default is 50].', required=False, default=50.0)
parser.add_argument('-et', '--evalue_threshold', type=float, help='Maximum E-value for an alignment between protein pairs from\ntwo gene-clusters to consider in search for orthologs.\n[Default is 0.001].', required=False, default=0.001)
Expand All @@ -111,6 +112,7 @@ def create_parser():
parser.add_argument('-fgl', '--full_genbank_labels', action='store_true', help='Use full GenBank labels instead of just the first 20 characters.', required=False, default=False)
parser.add_argument('-f', '--focal_genbanks', help='File with focal genbank(s) listed (one per line).', required=False, default=None)
parser.add_argument('-fc', '--comparator_genbanks', help='Optional file with comparator genbank(s) listed.\nDefault is to use remaining GenBanks as comparators to focal listing.', required=False, default=None)
parser.add_argument('-oo', '--only_orthogroups', action='store_true', help="Only compute ortholog groups and stop (runs up to step 2).", required=False, default=False)
parser.add_argument('-c', '--cpus', type=int, help="Number of cpus/threads to use.", required=False, default=1)
parser.add_argument('-mm', '--max_memory', type=int, help='Uses resource module to set soft memory limit. Provide in Giga-bytes.\nGenerally memory shouldn\'t be a major concern unless working\nwith hundreds of large metagenomes. [currently\nexperimental; default is None].', default=None, required=False)
parser.add_argument('-v', '--version', action='store_true', help="Get version and exit.", required=False, default=False)
Expand All @@ -136,6 +138,7 @@ def zolMain():

input_dir = os.path.abspath(myargs.input_dir) + '/'
outdir = os.path.abspath(myargs.output_dir) + '/'
select_fai_params_mode = myargs.select_fai_params_mode
cpus = myargs.cpus
use_super5 = myargs.use_super5
fubar_selection = myargs.selection_analysis
Expand All @@ -161,6 +164,7 @@ def zolMain():
custom_database = myargs.custom_database
allow_edge_cds_flag = myargs.allow_edge_cds
refine_gene_calling_flag = myargs.refine_gene_calling
only_orthogroups_flag = myargs.only_orthogroups
max_memory = myargs.max_memory

try:
Expand All @@ -182,6 +186,9 @@ def zolMain():
if not os.path.isdir(check_dir):
util.setupReadyDirectory([check_dir])

if select_fai_params_mode:
allow_edge_cds_flag = True

"""
START WORKFLOW
"""
Expand All @@ -199,16 +206,17 @@ def zolMain():

logObject.info("Saving parameters for future records.")
parameters_file = outdir + 'Parameter_Inputs.txt'
parameter_values = [input_dir, outdir, use_super5, fubar_selection, skip_gard,
focal_genbanks_listing_file, comparator_genbanks_listing_file, filter_lq_flag, filter_dq_flag,
ibc_flag, ces_flag, rename_lt_flag, allow_edge_cds_flag, dereplicate_flag, reinflate_flag, derep_identity,
derep_coverage, derep_inflation, custom_database, refine_gene_calling_flag, length, width,
full_genbank_labels, cpus, max_memory]
parameter_names = ["Input directory with loci GenBanks", "Output directory",
"Use super5 mode in MUSCLE alignments?", "Run FUBAR selection analyses?",
"Skip GARD partitioning by recombination breakpoints?", "Focal GenBanks listing",
"Comparator GenBanks listing", "Filter low quality gene clusters?",
"Filter draft/incomplete gene clusters?",
parameter_values = [input_dir, outdir, select_fai_params_mode, identity_threshold, coverage_threshold, evalue_threshold, use_super5,
fubar_selection, skip_gard, focal_genbanks_listing_file, comparator_genbanks_listing_file, filter_lq_flag,
filter_dq_flag, only_orthogroups_flag, ibc_flag, ces_flag, rename_lt_flag, allow_edge_cds_flag, dereplicate_flag,
reinflate_flag, derep_identity, derep_coverage, derep_inflation, custom_database, refine_gene_calling_flag,
length, width,full_genbank_labels, cpus, max_memory]
parameter_names = ["Input directory with loci GenBanks", "Output directory", "Select fai parameters mode?",
"Ortholog group finding identity threshold", "Ortholog group finding coverage threshold",
"Ortholog group finding E-value threshold", "Use super5 mode in MUSCLE alignments?",
"Run FUBAR selection analyses?", "Skip GARD partitioning by recombination breakpoints?",
"Focal GenBanks listing", "Comparator GenBanks listing", "Filter low quality gene clusters?",
"Filter draft/incomplete gene clusters?", "Only compute orthologs and stop?",
"Perform broad level estimation of ortholog group conservation if dereplication requested?",
"Comprehensive reporting of evolutionary statistics, including for non-single copy ortholog groups",
"Rename locus tags?", "Use CDS features with attribute near_scaffold_edge=True.",
Expand Down Expand Up @@ -396,7 +404,24 @@ def zolMain():
genbanks = drep_genbanks
assert(os.path.isfile(ortho_matrix_file))

# Step 3: Create Alignments, Phylogenies and Consensus Sequences
# If fai param selection mode, do that and exit:
if select_fai_params_mode:
logObject.info('--------------------\nAssessing parameters to recommend for running fai to detect additional instances.')
sys.stdout.write('--------------------\nAssessing parameters to recommend for running fai to detect additional instances.\n')
proc_dir = outdir + 'Ortholog_Group_Processing/'
hg_prot_dir = proc_dir + 'OG_Protein_Sequences/'
hg_nucl_dir = proc_dir + 'OG_Nucleotide_Sequences/'
util.setupReadyDirectory([proc_dir, hg_prot_dir, hg_nucl_dir])
zol.partitionSequencesByHomologGroups(ortho_matrix_file, prot_dir, nucl_dir, hg_prot_dir, hg_nucl_dir, logObject)
util.determineFaiParamRecommendataions(genbanks, ortho_matrix_file, hg_prot_dir, outdir, logObject, cpus=cpus)
sys.exit(0)

if only_orthogroups_flag:
logObject.info('--------------------\nRequested mode was to only compute ortholog groups across gene clusters\n.The ortholog-group by gene-cluster matrix can be found at: %s\n' % ortho_matrix_file)
sys.stdout.write('--------------------\nRequested mode was to only compute ortholog groups across gene clusters.\nThe ortholog-group by gene-cluster matrix can be found at: %s\n' % ortho_matrix_file)
sys.exit(0)

# Step 3 (normal): Create Alignments, Phylogenies and Consensus Sequences
logObject.info('--------------------\nStep 3\n--------------------\nCreating alignments, trees and consensus sequences for ortholog groups')
sys.stdout.write('--------------------\nStep 3\n--------------------\nCreating alignments, trees and consensus sequences for ortholog groups\n')
step3_check_file = check_dir + 'step3.txt'
Expand Down
44 changes: 36 additions & 8 deletions scripts/convertMiniprotGffToGbkAndProt.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def convertMiniProtGFFtoGenbank():
# Step 1: Parse GFF3 for transcript coordinates
query_mrna_coords = defaultdict(list)
scaffold_queries = defaultdict(list)
map_counter = 0
with open(miniprot_gff3) as omg:
for line in omg:
line = line.strip()
Expand All @@ -125,8 +126,9 @@ def convertMiniProtGFFtoGenbank():
if ls[6] == '-':
dire = -1
# in case there are paralogs
query_mrna_coords[tuple([query, start_coord])] = [scaffold, start_coord, end_coord, dire, score]
scaffold_queries[scaffold].append([query, start_coord])
query_mrna_coords[tuple([query, map_counter])] = [scaffold, start_coord, end_coord, dire, score]
scaffold_queries[scaffold].append([query, start_coord, map_counter])
map_counter += 1

# Step 2: Resolve overlap between mRNA transcripts
redundant = set([])
Expand All @@ -147,6 +149,7 @@ def convertMiniProtGFFtoGenbank():

# Step 3: Parse GFF3 for CDS coordinates
query_cds_coords = defaultdict(list)
query_cds_coords_accounted = defaultdict(set)
with open(miniprot_gff3) as omg:
for line in omg:
line = line.strip()
Expand All @@ -160,7 +163,29 @@ def convertMiniProtGFFtoGenbank():
dire = 1
if ls[6] == '-':
dire = -1
query_cds_coords[query].append([scaffold, start_coord, end_coord, dire])
score = float(ls[5])
curr_coord = tuple([scaffold, start_coord, end_coord, dire, score])
if not curr_coord in query_cds_coords_accounted[query]:
query_cds_coords[query].append([scaffold, start_coord, end_coord, dire, score])
query_cds_coords_accounted[query].add(curr_coord)

# handle overlapping exons
cds_redundant = defaultdict(set)
for query in sorted(query_cds_coords):
for i, cdsc1 in enumerate(sorted(query_cds_coords[query], key=itemgetter(1))):
c1_coords = set(range(cdsc1[1], cdsc1[2]))
c1_score = cdsc1[4]
for j, cdsc2 in enumerate(sorted(query_cds_coords[query], key=itemgetter(1))):
if i >= j: continue
# check scaffolds are the same
if cdsc1[0] != cdsc2[0]: continue
c2_coords = set(range(cdsc2[1], cdsc2[2]))
if len(c1_coords.intersection(c2_coords)) > 0:
c2_score = cdsc2[4]
if c1_score >= c2_score:
cds_redundant[query].add(tuple(cdsc2))
else:
cds_redundant[query].add(tuple(cdsc1))

# Step 4: Go through FASTA scaffold/contig by scaffold/contig and create output GenBank
gbk_handle = open(output_genbank, 'w')
Expand All @@ -174,15 +199,17 @@ def convertMiniProtGFFtoGenbank():
gbk_rec.annotations['molecule_type'] = 'DNA'
feature_list = []
for mrna in sorted(scaffold_queries[rec.id], key=itemgetter(1)):
qid, start = mrna
key = tuple([qid, start])
if key in redundant: continue
mrna_info = query_mrna_coords[key]
qid, start, map_counter = mrna
key_for_redundancy_check = tuple([qid, map_counter])
key_for_paf = tuple([qid, start])
if key_for_redundancy_check in redundant: continue
mrna_info = query_mrna_coords[key_for_redundancy_check]
mrna_coords = set(range(mrna_info[1], mrna_info[2]))

mrna_exon_locs = []
all_coords = []
for cds_info in query_cds_coords[qid]:
if tuple(cds_info) in cds_redundant[qid]: continue
cds_coords = set(range(cds_info[1], cds_info[2]))
if len(mrna_coords.intersection(cds_coords)) > 0 and cds_info[0] == mrna_info[0]:
all_coords.append([cds_info[1], cds_info[2]])
Expand Down Expand Up @@ -220,7 +247,7 @@ def convertMiniProtGFFtoGenbank():
assert(prot_seq != None)
"""

scaffold, paf_start_coord, paf_end_coord, paf_cigar_parsed, paf_string = query_mrna_paf_info[key]
scaffold, paf_start_coord, paf_end_coord, paf_cigar_parsed, paf_string = query_mrna_paf_info[key_for_paf]

paf_nucl_seq = ''
paf_coord = paf_start_coord
Expand Down Expand Up @@ -264,6 +291,7 @@ def convertMiniProtGFFtoGenbank():
"""

faa_handle.write('>' + locus_tag + '_' + lt + '\n' + paf_prot_seq + '\n')
feature.qualifiers['protein_from_reference'] = qid
feature.qualifiers['locus_tag'] = locus_tag + '_' + lt
feature.qualifiers['translation'] = paf_prot_seq
feature.qualifiers['open_reading_frame'] = paf_nucl_seq
Expand Down
42 changes: 42 additions & 0 deletions scripts/extractProteinsFromGenBank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os
import sys
from Bio import SeqIO
import argparse

def create_parser():
""" Parse arguments """
parser = argparse.ArgumentParser(description="""
Program: extractProteinsFromGenBank.py
Author: Rauf Salamzade
Affiliation: Kalan Lab, UW Madison, Department of Medical Microbiology and Immunology
A simple script
""", formatter_class=argparse.RawTextHelpFormatter)

parser.add_argument('-i', '--gene_cluster_genbank', help='Path to gene cluster genbank.', required=True)
args = parser.parse_args()
return args

myargs = create_parser()
gbk_file = myargs.gene_cluster_genbank

counter = 0
with open(gbk_file) as ogbf:
for rec in SeqIO.parse(ogbf, 'genbank'):
for feat in rec.features:
if not feat.type == 'CDS': continue
protein_id = None
try:
protein_id = feat.qualifiers.get('locus_tag')[0]
except:
pass

if protein_id == None:
try:
protein_id = feat.qualifiers.get('protein_id')[0]
except:
pass
if protein_id == None:
protein_id = 'CDS_' + str(counter)
counter += 1
print('>' + protein_id + '\n' + str(feat.qualifiers.get('translation')[0]))
4 changes: 2 additions & 2 deletions scripts/generateSyntenicVisual.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ def genSynVis():
pif_handle = open(plot_input_file, 'w')
pif_handle.write('\t'.join(['OG', 'Start', 'End', 'Direction', 'SC', 'Metric']) + '\n')
for index, row in df.iterrows():
og = row['ortholog group (OG) ID']
og = row['Ortholog Group (OG) ID']
og_cons = row['Proportion of Total Gene Clusters with OG']
if float(og_cons) < 0.25: continue
#if float(og_cons) < 0.25: continue
og_mlen = float(row['OG Median Length (bp)'])
og_dir = row['OG Consensus Direction']
sc_flag = row['OG is Single Copy?']
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os

setup(name='zol',
version='1.3.18',
version='1.3.19',
description='',
url='http://github.com/Kalan-Lab/zol/',
author='Rauf Salamzade',
Expand Down
2 changes: 1 addition & 1 deletion zol/orthologs/findOrthologs.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def refactorProteomes(inputs):
def oneVsAllParse(inputs):
sample, samp_algn_res, samp_forth_res, identity_cutoff, coverage_cutoff, logObject = inputs

rbh_cmd = [rbh_prog, samp_algn_res, str(identity_cutoff), str(coverage_cutoff), '>', samp_forth_res]
rbh_cmd = [rbh_prog, samp_algn_res, str(identity_cutoff), str(coverage_cutoff), sample, '>', samp_forth_res]
try:
subprocess.call(' '.join(rbh_cmd), shell=True, stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL, executable='/bin/bash')
Expand Down
Loading

0 comments on commit 4515225

Please sign in to comment.