Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter HMMs update #1881

Open
wants to merge 28 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9042b89
Filter hmm hits table if it has not been done already
mschecht Jan 28, 2022
2b68b69
adding a message to show that filter_HMMs already done
mschecht Jan 31, 2022
4b7302a
Merge branch 'master' into filter-HMMs-update
mschecht Jan 31, 2022
5683e92
update EcoPhylo test
mschecht Jan 31, 2022
82a7d21
parsing self values correctly
mschecht Jan 31, 2022
214c1c6
anvi-script-filter-hmm-hits-table records HMM domain filtering attrib…
mschecht Feb 2, 2022
ecfb79c
no needed
mschecht Feb 2, 2022
f908ea0
updated EcoPhylo to use new contigsDB self attributes
mschecht Feb 2, 2022
0a29d6d
fix
mschecht Feb 2, 2022
a14c999
more clear
mschecht Feb 3, 2022
752b13d
remove whitespace & unused or redefined libraries
meren Feb 3, 2022
bd0ba68
suppress output messages
meren Feb 3, 2022
ae346be
the artifact name is contigs-db, not contigsDB :)
meren Feb 3, 2022
2f7dedf
cosmetics
meren Feb 3, 2022
21e4cef
should not load contigsDB before checking domtable path
mschecht Feb 7, 2022
c04470f
fix
mschecht Feb 7, 2022
eed3714
warning to use hmmsearch with --domtblout if planning on using anvi-s…
mschecht Feb 7, 2022
3375156
sanity checks and better description of target and query coverage
mschecht Feb 7, 2022
8e2c02c
sanity check to see if domtblout came from same HMM_source being filt…
mschecht Feb 7, 2022
99ce247
clear error message
mschecht Feb 7, 2022
2f69ff9
fix
mschecht Feb 7, 2022
0c4bbf2
user may only provide one filtering parameter
mschecht Feb 7, 2022
8052c14
better warning
meren Feb 15, 2022
3bb5a9e
remove trailing whitespaces, contigsDBs, and unnecessary f-string
meren Feb 15, 2022
3f5ea74
make sure they are gene calls
meren Feb 15, 2022
f8d9b63
unused
meren Feb 15, 2022
6f6855a
parameterize `no_header` variable
meren Feb 15, 2022
37c92d1
the DOM table we filter here HAS a header.
meren Feb 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions anvio/parsers/hmmer.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,6 @@ def find_line(self, condition):

def read_lines_until(self, condition, include_last=False, store=True):
lines = []
return_value = lines if store else True

for line in self.query_lines[self.line_no:]:
self.line_no += 1
Expand Down Expand Up @@ -535,7 +534,7 @@ class HMMERTableOutput(Parser):
Which HMMER program was used to generate the output we are parsing? Pick from {'hmmscan', 'hmmsearch'}
"""

def __init__(self, hmmer_table_txt, alphabet='AA', context='GENE', program='hmmscan', run=terminal.Run()):
def __init__(self, hmmer_table_txt, alphabet='AA', context='GENE', program='hmmscan', no_header=True, run=terminal.Run()):
self.alphabet = alphabet
self.context = context
self.program = program
Expand Down Expand Up @@ -563,7 +562,7 @@ def __init__(self, hmmer_table_txt, alphabet='AA', context='GENE', program='hmms
'col_names': col_names,
'col_mapping': col_mapping,
'indexing_field': -1,
'no_header': True,
'no_header': no_header,
},
}

Expand Down
16 changes: 7 additions & 9 deletions anvio/tests/run_workflow_tests_for_ecophylo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ cp $files/data/genomes/bacteria/*.db $output_
cp $files/data/genomes/archaea/*.db $output_dir/workflow_test
cp $files/data/input_files/metagenomes.txt $output_dir/workflow_test
cp $files/data/input_files/external-genomes.txt $output_dir/workflow_test
cp $files/data/input_files/hmm_list.txt $output_dir/workflow_test
cp $files/data/input_files/hmm_list_external.txt $output_dir/workflow_test
cp $files/data/input_files/hmm_list* $output_dir/workflow_test
cd $output_dir/workflow_test

INFO "Creating a default config for ecophylo workflow"
Expand Down Expand Up @@ -50,7 +49,7 @@ anvi-run-workflow -w ecophylo -c only-metagenomes-txt-config.json -A --dry-run
INFO "Running ecophylo workflow with ecophylo dry-run: only external-genomes.txt"
anvi-run-workflow -w ecophylo -c only-external-genomes-txt-config.json -A --dry-run

INFO "Running ecophylo workflow"
INFO "EcoPhylo: profiling evolution AND ecology with Ribosomal_L16 on metagenomes and genomes"
anvi-run-workflow -w ecophylo -c default-config.json

INFO "Running ecophylo workflow interactive"
Expand All @@ -63,11 +62,11 @@ rm -rf $output_dir/workflow_test/ECOPHYLO_WORKFLOW/
INFO "Saving a workflow graph - no samples.txt"
anvi-run-workflow -w ecophylo -c no-samples-txt-config.json --save-workflow-graph

INFO "Running ecophylo workflow with ecophylo dry-run - no samples.txt"
INFO "EcoPhylo: profiling just evolution with Ribosomal_L16 and Ribosomal_L2 on metagenomes and genomes - no samples.txt (dry-run)"
anvi-run-workflow -w ecophylo -c no-samples-txt-config.json -A --dry-run

INFO "Running ecophylo workflow - no samples.txt"
anvi-run-workflow -w ecophylo -c no-samples-txt-config.json
INFO "EcoPhylo: profiling just evolution with Ribosomal_L16 and Ribosomal_L2 on metagenomes and genomes - no samples.txt"
anvi-run-workflow -w ecophylo -c no-samples-txt-config.json

INFO "Running ecophylo workflow interactive"
HMM="Ribosomal_L16"
Expand All @@ -82,6 +81,5 @@ anvi-run-workflow -w ecophylo -c only-external-genomes-txt-config.json

INFO "Running ecophylo workflow interactive from external HMM"
HMM="Ribosomal_L16"
anvi-interactive -t ECOPHYLO_WORKFLOW/05_TREES/"${HMM}"/"${HMM}"_renamed.nwk \
-p ECOPHYLO_WORKFLOW/05_TREES/"${HMM}"/"${HMM}"-PROFILE.db \
--manual
anvi-interactive -c ECOPHYLO_WORKFLOW/METAGENOMICS_WORKFLOW/03_CONTIGS/"${HMM}"-contigs.db \
-p ECOPHYLO_WORKFLOW/METAGENOMICS_WORKFLOW/06_MERGED/"${HMM}"/PROFILE.db
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
name source path
Ribosomal_L16 Bacteria_71 INTERNAL
Ribosomal_L2 Bacteria_71 INTERNAL
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"metagenomes": "metagenomes.txt",
"external_genomes": "external-genomes.txt",
"hmm_list": "hmm_list.txt",
"hmm_list": "hmm_list-no-samples-txt.txt",
"cluster_representative_method": {
"method": "mmseqs"
},
Expand Down
112 changes: 62 additions & 50 deletions anvio/workflows/ecophylo/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ rule anvi_run_hmms_hmmsearch:
log: os.path.join(dirs_dict['LOGS_DIR'], "anvi_run_hmms_hmmsearch-{sample_name}-{HMM}.log")
input:
output:
done = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], "{sample_name}-{HMM}-dom_hmmsearch/contigs-hmmsearch.done"),
done = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], "{sample_name}-{HMM}-contigs-hmmsearch.done"),
params:
hmm_source = M.get_param_value_from_config(['anvi_run_hmms_hmmsearch', '--installed-hmm-profile']),
additional_params = M.get_param_value_from_config(['trim_alignment', 'additional_params'])
Expand Down Expand Up @@ -116,20 +116,59 @@ rule filter_hmm_hits_by_query_coverage:
HMM_dir = os.path.join(M.HMM_path_dict[wildcards.HMM])
HMM_source = M.HMM_source_dict[wildcards.HMM]
domtblout = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], f"{wildcards.sample_name}-{HMM_source}-dom_hmmsearch/hmm.domtable")
hmmer_output_dir = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], f"{wildcards.sample_name}-{HMM_source}-dom_hmmsearch")

# Check if anvi-run-scg-taxonomy and/or anvi-script-filter-hmm-hits-table has been run already
contigs_db = ContigsDatabase(contigsDB)
try:
HMM_dom_filter_sources = contigs_db.meta['HMM_dom_filter_sources']
HMM_dom_filter_target_coverage = contigs_db.meta['HMM_dom_filter_target_coverage']
HMM_dom_filter_query_coverage = contigs_db.meta['HMM_dom_filter_query_coverage']

HMM_dom_filter_sources_list = HMM_dom_filter_sources.split(",")
HMM_dom_filter_target_coverage_list = HMM_dom_filter_target_coverage.split(",")
HMM_dom_filter_query_coverage_list = HMM_dom_filter_query_coverage.split(",")

source_domain_filter_values = list(zip(HMM_dom_filter_sources_list, HMM_dom_filter_target_coverage_list, HMM_dom_filter_query_coverage_list))

domain_filter_values_dict = {}
for item in source_domain_filter_values:
domain_filter_values_dict[item[0]] = item
except:
domain_filter_values_dict = {}

contigs_db.disconnect()

if HMM_source in M.internal_HMM_sources:
shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \
--domain-hits-table {domtblout} \
--hmm-source {HMM_source} \
--query-coverage {params.query_coverage} \
{params.additional_params} 2> {log}")
if HMM_source in domain_filter_values_dict.keys():
if float(params.query_coverage) > float(domain_filter_values_dict[HMM_source][2]):
shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \
--domain-hits-table {domtblout} \
--hmm-source {HMM_source} \
--query-coverage {params.query_coverage} \
{params.additional_params} 2> {log}")
else:
print(f"The HMM source {HMM_source} has already been filtered with more stringent query coverage value: {domain_filter_values_dict[HMM_source][2]}, skipping filter_hmm_hits_by_query_coverage!")
pass
else:
shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \
--domain-hits-table {domtblout} \
--hmm-source {HMM_source} \
--query-coverage {params.query_coverage} \
{params.additional_params} 2> {log}")
else:
shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \
--domain-hits-table {domtblout} \
--hmm-profile-dir {HMM_dir} \
--hmm-source {HMM_source} \
--query-coverage {params.query_coverage} \
{params.additional_params} 2> {log}")
if HMM_source in domain_filter_values_dict.keys():
if float(params.query_coverage) > float(domain_filter_values_dict[HMM_source][2]):
shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \
--domain-hits-table {domtblout} \
--hmm-profile-dir {HMM_dir} \
--hmm-source {HMM_source} \
--query-coverage {params.query_coverage} \
{params.additional_params} 2> {log}")
else:
print(f"The HMM source {HMM_source} has already been filtered with more stringent query coverage value: {domain_filter_values_dict[HMM_source][2]}, skipping filter_hmm_hits_by_query_coverage!")
pass

shell('touch {output.done}')


Expand Down Expand Up @@ -968,7 +1007,7 @@ if M.samples_txt_file:
# basics
state_dict['version'] = '3'
state_dict['tree-type'] = 'phylogram'
state_dict['current-view'] = 'single'
state_dict['current-view'] = 'mean_coverage'

# height and width
# FIXME: It's unclear to me how the interactive interface determines
Expand Down Expand Up @@ -1066,22 +1105,23 @@ if M.samples_txt_file:
# views
views_dict = {}

single_dict = {}
mean_coverage_dict = {}

false = False
percent_identity = {
"normalization": "none",
"min": {
"value": "90",
"disabled": "false"
"disabled": false
},
"max": {
"value": "100",
"disabled": "false"
"disabled": false
}
}

single_dict['percent_identity'] = percent_identity
views_dict['single'] = single_dict
mean_coverage_dict['percent_identity'] = percent_identity
views_dict['mean_coverage'] = mean_coverage_dict
state_dict['views'] = views_dict

with open(output.state_file, "w") as outfile:
Expand Down Expand Up @@ -1177,23 +1217,16 @@ else:


# layer-orders
first_layers = ["__parent__", "length", "gc_content"]

layer_order = first_layers + misc_layers_list
if HMM_source in M.internal_HMM_sources:
layer_order = misc_layers_list + scg_taxonomy_layers_list
else:
layer_order = misc_layers_list

state_dict['layer-order'] = layer_order

# layers
layers_dict = {}

layer_attributes_parent = {
"color": "#000000",
"height": "0",
"margin": "15",
"type": "color",
"color-start": "#FFFFFF"
}

length = {
"color": "#000000",
"height": "0",
Expand All @@ -1202,22 +1235,6 @@ else:
"color-start": "#FFFFFF"
}

gc_content = {
"color": "#000000",
"height": "0",
"margin": "15",
"type": "color",
"color-start": "#FFFFFF"
}

identifier = {
"color": "#000000",
"height": "0",
"margin": "15",
"type": "color",
"color-start": "#FFFFFF"
}

names = {
"color": "#000000",
"height": "0",
Expand All @@ -1234,12 +1251,7 @@ else:
"color-start": "#FFFFFF"
}

layers_dict['__parent__'] = layer_attributes_parent
layers_dict['length'] = length
layers_dict['gc_content'] = gc_content
layers_dict['identifier'] = identifier
layers_dict['percent_identity'] = percent_identity

state_dict['layers'] = layers_dict

# views
Expand Down
15 changes: 14 additions & 1 deletion bin/anvi-run-hmms
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@ import sys
import anvio
import anvio.utils as utils
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths

with terminal.SuppressAllOutput():
import anvio.data.hmm as hmm_data

available_hmm_sources = list(hmm_data.sources.keys())

from anvio.errors import ConfigError, FilesNPathsError
from anvio.terminal import time_program
from anvio.errors import ConfigError, FilesNPathsError
from anvio.tables.hmmhits import TablesForHMMHits
from anvio.tables.trnahits import TablesForTransferRNAs

Expand Down Expand Up @@ -90,6 +91,18 @@ def main(args):
raise ConfigError("We can see that you have requested --domain-hits-table but you haven't asked us to store "
"this output in a directory with --hmmer-output-dir. There is no point to requesting this output "
"if you are never going to see it, so we figured we'd stop you right there. :)")

if args.domain_hits_table and args.hmmer_program != "hmmsearch":
run.warning("You requested to save the --domtblout without using hmmsearch. We wanted to kindly warn you that if you "
"that if you plan on using `anvi-script-filter-hmm-hits-table` later to remove weak hits, you must use "
"`hmmsearch` instead of `hmmscan`. You can instruct anvi'o to switch to `hmmsearch` by including the "
"parameter `--hmmer-program hmmsearch` to your `anvi-run-hmms` command.")

domtable_path = os.path.join(args.hmmer_output_dir + "/hmm.table")
if filesnpaths.is_file_exists(domtable_path, dont_raise=True):
raise ConfigError(f"The file {domtable_path} already exists, and anvi'o does not like to "
"to overwrite things. Please either remove the file or rename your "
"desired output.")

search_tables = TablesForHMMHits(args.contigs_db, num_threads_to_use=args.num_threads, just_do_it=args.just_do_it,
hmm_program_to_use=args.hmmer_program, hmmer_output_directory=args.hmmer_output_dir,
Expand Down
Loading