From 9042b89874f645afd514a8b79580ba40a8bda98d Mon Sep 17 00:00:00 2001 From: mschechter Date: Fri, 28 Jan 2022 11:17:18 -0800 Subject: [PATCH 01/27] Filter hmm hits table if it has not been done already --- anvio/workflows/ecophylo/Snakefile | 105 ++++++++++------------ sandbox/anvi-script-filter-hmm-hits-table | 14 +++ 2 files changed, 62 insertions(+), 57 deletions(-) diff --git a/anvio/workflows/ecophylo/Snakefile b/anvio/workflows/ecophylo/Snakefile index 25662dd2eb..6750f4b5d3 100644 --- a/anvio/workflows/ecophylo/Snakefile +++ b/anvio/workflows/ecophylo/Snakefile @@ -44,7 +44,7 @@ rule anvi_run_hmms_hmmsearch: log: os.path.join(dirs_dict['LOGS_DIR'], "anvi_run_hmms_hmmsearch-{sample_name}-{HMM}.log") input: output: - done = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], "{sample_name}-{HMM}-dom_hmmsearch/contigs-hmmsearch.done"), + done = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], "{sample_name}-{HMM}-contigs-hmmsearch.done"), params: hmm_source = M.get_param_value_from_config(['anvi_run_hmms_hmmsearch', '--installed-hmm-profile']), additional_params = M.get_param_value_from_config(['trim_alignment', 'additional_params']) @@ -69,14 +69,14 @@ rule anvi_run_hmms_hmmsearch: else: pass - # Load contigsDB - contigs_db = ContigsDatabase(contigsDB) - # Check if anvi-run-scg-taxonomy has been run - scg_taxonomy_was_run_value = contigs_db.meta['scg_taxonomy_was_run'] + # Load contigsDB + contigs_db = ContigsDatabase(contigsDB) + # Check if anvi-run-scg-taxonomy has been run + scg_taxonomy_was_run_value = contigs_db.meta['scg_taxonomy_was_run'] - if scg_taxonomy_was_run_value != 1: - print(f"Running anvi-run-scg-taxonomy on {contigsDB} since the {wildcards.HMM} is in anvio's collection of SCGs") - shell("anvi-run-scg-taxonomy -c {contigsDB} --num-threads {threads} {params.additional_params}") + if scg_taxonomy_was_run_value != 1: + print(f"Running anvi-run-scg-taxonomy on {contigsDB} since the {wildcards.HMM} is in anvio's collection of SCGs") + shell("anvi-run-scg-taxonomy -c {contigsDB} --num-threads {threads} {params.additional_params}") else: if not os.path.exists(hmmer_output_dir): print(f"Running external HMM dataset: {wildcards.HMM}") @@ -115,20 +115,38 @@ rule filter_hmm_hits_by_query_coverage: HMM_dir = os.path.join(M.HMM_path_dict[wildcards.HMM]) HMM_source = M.HMM_source_dict[wildcards.HMM] domtblout = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], f"{wildcards.sample_name}-{HMM_source}-dom_hmmsearch/hmm.domtable") + hmmer_output_dir = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], f"{wildcards.sample_name}-{HMM_source}-dom_hmmsearch") + + # Check if anvi-run-scg-taxonomy and/or anvi-script-filter-hmm-hits-table has been run already + contigs_db = ContigsDatabase(contigsDB) + try: + HMM_source_domain_table_was_filtered = contigs_db.meta['HMM_source_domain_table_was_filtered'] + HMM_source_domain_table_was_filtered_list = ','.split(HMM_source_domain_table_was_filtered) + except: + HMM_source_domain_table_was_filtered_list = [] + + contigs_db.disconnect() if HMM_source in M.internal_HMM_sources: - shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ - --domain-hits-table {domtblout} \ - --hmm-source {HMM_source} \ - --query-coverage {params.query_coverage} \ - {params.additional_params} 2> {log}") + if HMM_source not in HMM_source_domain_table_was_filtered_list: + shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ + --domain-hits-table {domtblout} \ + --hmm-source {HMM_source} \ + --query-coverage {params.query_coverage} \ + {params.additional_params} 2> {log}") + else: + pass else: - shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ - --domain-hits-table {domtblout} \ - --hmm-profile-dir {HMM_dir} \ - --hmm-source {HMM_source} \ - --query-coverage {params.query_coverage} \ - {params.additional_params} 2> {log}") + if HMM_source not in HMM_source_domain_table_was_filtered_list: + shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ + --domain-hits-table {domtblout} \ + --hmm-profile-dir {HMM_dir} \ + --hmm-source {HMM_source} \ + --query-coverage {params.query_coverage} \ + {params.additional_params} 2> {log}") + else: + pass + shell('touch {output.done}') @@ -967,7 +985,7 @@ if M.samples_txt_file: # basics state_dict['version'] = '3' state_dict['tree-type'] = 'phylogram' - state_dict['current-view'] = 'single' + state_dict['current-view'] = 'mean_coverage' # height and width # FIXME: It's unclear to me how the interactive interface determines @@ -1065,22 +1083,23 @@ if M.samples_txt_file: # views views_dict = {} - single_dict = {} + mean_coverage_dict = {} + false = False percent_identity = { "normalization": "none", "min": { "value": "90", - "disabled": "false" + "disabled": false }, "max": { "value": "100", - "disabled": "false" + "disabled": false } } - single_dict['percent_identity'] = percent_identity - views_dict['single'] = single_dict + mean_coverage_dict['percent_identity'] = percent_identity + views_dict['mean_coverage'] = mean_coverage_dict state_dict['views'] = views_dict with open(output.state_file, "w") as outfile: @@ -1176,23 +1195,16 @@ else: # layer-orders - first_layers = ["__parent__", "length", "gc_content"] - - layer_order = first_layers + misc_layers_list + if HMM_source in M.internal_HMM_sources: + layer_order = misc_layers_list + scg_taxonomy_layers_list + else: + layer_order = misc_layers_list state_dict['layer-order'] = layer_order # layers layers_dict = {} - layer_attributes_parent = { - "color": "#000000", - "height": "0", - "margin": "15", - "type": "color", - "color-start": "#FFFFFF" - } - length = { "color": "#000000", "height": "0", @@ -1201,22 +1213,6 @@ else: "color-start": "#FFFFFF" } - gc_content = { - "color": "#000000", - "height": "0", - "margin": "15", - "type": "color", - "color-start": "#FFFFFF" - } - - identifier = { - "color": "#000000", - "height": "0", - "margin": "15", - "type": "color", - "color-start": "#FFFFFF" - } - names = { "color": "#000000", "height": "0", @@ -1233,12 +1229,7 @@ else: "color-start": "#FFFFFF" } - layers_dict['__parent__'] = layer_attributes_parent - layers_dict['length'] = length - layers_dict['gc_content'] = gc_content - layers_dict['identifier'] = identifier layers_dict['percent_identity'] = percent_identity - state_dict['layers'] = layers_dict # views diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 734b5b5344..e82f181c4a 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -232,6 +232,20 @@ class FilterHmmHitsTable(object): hmm_tables.append_to_hmm_hits_table(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict) + # Add meta value to show which HMM_source was filtered + self.db = db.DB(self.contigs_db_path, anvio.__contigs__version__, new_database=False) + sources_that_have_been_filtered = [self.db.get_meta_value('HMM_hits_domain_table_was_filtered', return_none_if_not_in_table=True)] + + if sources_that_have_been_filtered == None: + sources_that_have_been_filtered = [source] + self.db.set_meta_value('HMM_hits_domain_table_was_filtered', sources_that_have_been_filtered) + + if source not in sources_that_have_been_filtered: + sources_that_have_been_filtered.append(source) + + self.db.update_meta_value('HMM_hits_domain_table_was_filtered', str(sources_that_have_been_filtered)) + self.db.disconnect() + @terminal.time_program def main(args): From 2b68b69d85deef44b8c57d68bd366fa58bd86706 Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 31 Jan 2022 09:19:38 -0800 Subject: [PATCH 02/27] adding a message to show that filter_HMMs already done --- anvio/workflows/ecophylo/Snakefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anvio/workflows/ecophylo/Snakefile b/anvio/workflows/ecophylo/Snakefile index 6750f4b5d3..2d895ca89b 100644 --- a/anvio/workflows/ecophylo/Snakefile +++ b/anvio/workflows/ecophylo/Snakefile @@ -135,6 +135,7 @@ rule filter_hmm_hits_by_query_coverage: --query-coverage {params.query_coverage} \ {params.additional_params} 2> {log}") else: + print(f"HMM source {HMM_source} has already been filtered, skipping!") pass else: if HMM_source not in HMM_source_domain_table_was_filtered_list: @@ -145,6 +146,7 @@ rule filter_hmm_hits_by_query_coverage: --query-coverage {params.query_coverage} \ {params.additional_params} 2> {log}") else: + print(f"HMM source {HMM_source} has already been filtered, skipping!") pass shell('touch {output.done}') From 5683e9298295afd96c57569c2cbbe88eebf1cd87 Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 31 Jan 2022 14:12:54 -0800 Subject: [PATCH 03/27] update EcoPhylo test --- anvio/tests/run_workflow_tests_for_ecophylo.sh | 16 +++++++--------- .../data/input_files/hmm_list-no-samples-txt.txt | 3 +++ .../ecophylo/no-samples-txt-config.json | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) create mode 100644 anvio/tests/sandbox/data/input_files/hmm_list-no-samples-txt.txt diff --git a/anvio/tests/run_workflow_tests_for_ecophylo.sh b/anvio/tests/run_workflow_tests_for_ecophylo.sh index ea5cafc1e3..77616be5f5 100755 --- a/anvio/tests/run_workflow_tests_for_ecophylo.sh +++ b/anvio/tests/run_workflow_tests_for_ecophylo.sh @@ -12,8 +12,7 @@ cp $files/data/genomes/bacteria/*.db $output_ cp $files/data/genomes/archaea/*.db $output_dir/workflow_test cp $files/data/input_files/metagenomes.txt $output_dir/workflow_test cp $files/data/input_files/external-genomes.txt $output_dir/workflow_test -cp $files/data/input_files/hmm_list.txt $output_dir/workflow_test -cp $files/data/input_files/hmm_list_external.txt $output_dir/workflow_test +cp $files/data/input_files/hmm_list* $output_dir/workflow_test cd $output_dir/workflow_test INFO "Creating a default config for ecophylo workflow" @@ -50,7 +49,7 @@ anvi-run-workflow -w ecophylo -c only-metagenomes-txt-config.json -A --dry-run INFO "Running ecophylo workflow with ecophylo dry-run: only external-genomes.txt" anvi-run-workflow -w ecophylo -c only-external-genomes-txt-config.json -A --dry-run -INFO "Running ecophylo workflow" +INFO "EcoPhylo: profiling evolution AND ecology with Ribosomal_L16 on metagenomes and genomes" anvi-run-workflow -w ecophylo -c default-config.json INFO "Running ecophylo workflow interactive" @@ -63,11 +62,11 @@ rm -rf $output_dir/workflow_test/ECOPHYLO_WORKFLOW/ INFO "Saving a workflow graph - no samples.txt" anvi-run-workflow -w ecophylo -c no-samples-txt-config.json --save-workflow-graph -INFO "Running ecophylo workflow with ecophylo dry-run - no samples.txt" +INFO "EcoPhylo: profiling just evolution with Ribosomal_L16 and Ribosomal_L2 on metagenomes and genomes - no samples.txt (dry-run)" anvi-run-workflow -w ecophylo -c no-samples-txt-config.json -A --dry-run -INFO "Running ecophylo workflow - no samples.txt" -anvi-run-workflow -w ecophylo -c no-samples-txt-config.json +INFO "EcoPhylo: profiling just evolution with Ribosomal_L16 and Ribosomal_L2 on metagenomes and genomes - no samples.txt" +anvi-run-workflow -w ecophylo -c no-samples-txt-config.json INFO "Running ecophylo workflow interactive" HMM="Ribosomal_L16" @@ -82,6 +81,5 @@ anvi-run-workflow -w ecophylo -c only-external-genomes-txt-config.json INFO "Running ecophylo workflow interactive from external HMM" HMM="Ribosomal_L16" -anvi-interactive -t ECOPHYLO_WORKFLOW/05_TREES/"${HMM}"/"${HMM}"_renamed.nwk \ - -p ECOPHYLO_WORKFLOW/05_TREES/"${HMM}"/"${HMM}"-PROFILE.db \ - --manual \ No newline at end of file +anvi-interactive -c ECOPHYLO_WORKFLOW/METAGENOMICS_WORKFLOW/03_CONTIGS/"${HMM}"-contigs.db \ + -p ECOPHYLO_WORKFLOW/METAGENOMICS_WORKFLOW/06_MERGED/"${HMM}"/PROFILE.db \ No newline at end of file diff --git a/anvio/tests/sandbox/data/input_files/hmm_list-no-samples-txt.txt b/anvio/tests/sandbox/data/input_files/hmm_list-no-samples-txt.txt new file mode 100644 index 0000000000..56b506ee2c --- /dev/null +++ b/anvio/tests/sandbox/data/input_files/hmm_list-no-samples-txt.txt @@ -0,0 +1,3 @@ +name source path +Ribosomal_L16 Bacteria_71 INTERNAL +Ribosomal_L2 Bacteria_71 INTERNAL diff --git a/anvio/tests/sandbox/workflows/ecophylo/no-samples-txt-config.json b/anvio/tests/sandbox/workflows/ecophylo/no-samples-txt-config.json index dc959d1b47..9bd6e10f41 100644 --- a/anvio/tests/sandbox/workflows/ecophylo/no-samples-txt-config.json +++ b/anvio/tests/sandbox/workflows/ecophylo/no-samples-txt-config.json @@ -1,7 +1,7 @@ { "metagenomes": "metagenomes.txt", "external_genomes": "external-genomes.txt", - "hmm_list": "hmm_list.txt", + "hmm_list": "hmm_list-no-samples-txt.txt", "cluster_representative_method": { "method": "mmseqs" }, From 82a7d211a84e7e6a9506f3c839a0bd7c19cedd96 Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 31 Jan 2022 14:13:16 -0800 Subject: [PATCH 04/27] parsing self values correctly --- anvio/workflows/ecophylo/Snakefile | 20 ++++++++++---------- sandbox/anvi-script-filter-hmm-hits-table | 16 +++++++++------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/anvio/workflows/ecophylo/Snakefile b/anvio/workflows/ecophylo/Snakefile index 9bc5c523c7..813898f51d 100644 --- a/anvio/workflows/ecophylo/Snakefile +++ b/anvio/workflows/ecophylo/Snakefile @@ -121,34 +121,34 @@ rule filter_hmm_hits_by_query_coverage: # Check if anvi-run-scg-taxonomy and/or anvi-script-filter-hmm-hits-table has been run already contigs_db = ContigsDatabase(contigsDB) try: - HMM_source_domain_table_was_filtered = contigs_db.meta['HMM_source_domain_table_was_filtered'] - HMM_source_domain_table_was_filtered_list = ','.split(HMM_source_domain_table_was_filtered) + HMM_source_domain_table_was_filtered = contigs_db.meta['HMM_hits_domain_table_was_filtered'] + HMM_source_domain_table_was_filtered_list = HMM_source_domain_table_was_filtered.split(",") except: HMM_source_domain_table_was_filtered_list = [] contigs_db.disconnect() if HMM_source in M.internal_HMM_sources: - if HMM_source not in HMM_source_domain_table_was_filtered_list: + if HMM_source in HMM_source_domain_table_was_filtered_list: + print(f"The HMM source {HMM_source} has already been filtered, skipping filter_hmm_hits_by_query_coverage!") + pass + else: shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ --domain-hits-table {domtblout} \ --hmm-source {HMM_source} \ --query-coverage {params.query_coverage} \ {params.additional_params} 2> {log}") - else: - print(f"HMM source {HMM_source} has already been filtered, skipping!") - pass else: - if HMM_source not in HMM_source_domain_table_was_filtered_list: + if HMM_source in HMM_source_domain_table_was_filtered_list: + print(f"The HMM source {HMM_source} has already been filtered, skipping filter_hmm_hits_by_query_coverage!") + pass + else: shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ --domain-hits-table {domtblout} \ --hmm-profile-dir {HMM_dir} \ --hmm-source {HMM_source} \ --query-coverage {params.query_coverage} \ {params.additional_params} 2> {log}") - else: - print(f"HMM source {HMM_source} has already been filtered, skipping!") - pass shell('touch {output.done}') diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index e82f181c4a..3c7382dfa3 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -234,16 +234,18 @@ class FilterHmmHitsTable(object): # Add meta value to show which HMM_source was filtered self.db = db.DB(self.contigs_db_path, anvio.__contigs__version__, new_database=False) - sources_that_have_been_filtered = [self.db.get_meta_value('HMM_hits_domain_table_was_filtered', return_none_if_not_in_table=True)] + sources_that_have_been_filtered = self.db.get_meta_value('HMM_hits_domain_table_was_filtered', return_none_if_not_in_table=True) if sources_that_have_been_filtered == None: - sources_that_have_been_filtered = [source] - self.db.set_meta_value('HMM_hits_domain_table_was_filtered', sources_that_have_been_filtered) - - if source not in sources_that_have_been_filtered: - sources_that_have_been_filtered.append(source) + self.db.set_meta_value('HMM_hits_domain_table_was_filtered', source) + else: + sources_that_have_been_filtered_list = sources_that_have_been_filtered.split(",") + sources_that_have_been_filtered_set = set(sources_that_have_been_filtered_list) + sources_that_have_been_filtered_set.add(source) + updated_sources = ",".join(list(sources_that_have_been_filtered_set)) + self.db.update_meta_value('HMM_hits_domain_table_was_filtered', str(updated_sources)) - self.db.update_meta_value('HMM_hits_domain_table_was_filtered', str(sources_that_have_been_filtered)) + # self.db.update_meta_value('HMM_hits_domain_table_was_filtered', str(sources_that_have_been_filtered)) self.db.disconnect() From 214c1c6a186345fb850fffa47705ed7b623fe73a Mon Sep 17 00:00:00 2001 From: mschechter Date: Wed, 2 Feb 2022 07:54:55 -0800 Subject: [PATCH 05/27] anvi-script-filter-hmm-hits-table records HMM domain filtering attributes --- sandbox/anvi-script-filter-hmm-hits-table | 56 +++++++++++++++++++---- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 3c7382dfa3..bf4b001fc0 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -233,19 +233,55 @@ class FilterHmmHitsTable(object): hmm_tables.append_to_hmm_hits_table(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict) # Add meta value to show which HMM_source was filtered + # Self attributes that need to be added: + # - HMM_dom_filter_sources + # - HMM_dom_filter_target_coverage + # - HMM_dom_filter_query_coverage + self.db = db.DB(self.contigs_db_path, anvio.__contigs__version__, new_database=False) - sources_that_have_been_filtered = self.db.get_meta_value('HMM_hits_domain_table_was_filtered', return_none_if_not_in_table=True) - if sources_that_have_been_filtered == None: - self.db.set_meta_value('HMM_hits_domain_table_was_filtered', source) + HMM_dom_filter_sources = self.db.get_meta_value('HMM_dom_filter_sources', return_none_if_not_in_table=True) + HMM_dom_filter_target_coverage = self.db.get_meta_value('HMM_dom_filter_target_coverage', return_none_if_not_in_table=True) + HMM_dom_filter_query_coverage = self.db.get_meta_value('HMM_dom_filter_query_coverage', return_none_if_not_in_table=True) + + if HMM_dom_filter_sources == None: + self.db.set_meta_value('HMM_dom_filter_sources', source) + if self.target_coverage == None: + self.db.set_meta_value('HMM_dom_filter_target_coverage', 0.00) + else: + self.db.set_meta_value('HMM_dom_filter_target_coverage', self.target_coverage) + if self.query_coverage == None: + self.db.set_meta_value('HMM_dom_filter_query_coverage', 0.00) + else: + self.db.set_meta_value('HMM_dom_filter_query_coverage', self.query_coverage) else: - sources_that_have_been_filtered_list = sources_that_have_been_filtered.split(",") - sources_that_have_been_filtered_set = set(sources_that_have_been_filtered_list) - sources_that_have_been_filtered_set.add(source) - updated_sources = ",".join(list(sources_that_have_been_filtered_set)) - self.db.update_meta_value('HMM_hits_domain_table_was_filtered', str(updated_sources)) - - # self.db.update_meta_value('HMM_hits_domain_table_was_filtered', str(sources_that_have_been_filtered)) + HMM_dom_filter_sources_list = HMM_dom_filter_sources.split(",") + HMM_dom_filter_target_coverage_list = str(HMM_dom_filter_target_coverage).split(",") + HMM_dom_filter_query_coverage_list = str(HMM_dom_filter_query_coverage).split(",") + + source_domain_filter_values = list(zip(HMM_dom_filter_sources_list, HMM_dom_filter_target_coverage_list, HMM_dom_filter_query_coverage_list)) + + if self.target_coverage == None: + self.target_coverage = 0.00 + if self.query_coverage == None: + self.query_coverage = 0.00 + + new_filtering_parameters = (source, self.target_coverage, self.query_coverage) + + for i, item in enumerate(source_domain_filter_values): + if new_filtering_parameters[0] == item[0]: + if float(item[1]) < float(new_filtering_parameters[1]) or float(item[2]) < float(new_filtering_parameters[2]): + source_domain_filter_values[i] = new_filtering_parameters + + if new_filtering_parameters[0] not in HMM_dom_filter_sources_list: + source_domain_filter_values.append(new_filtering_parameters) + + updated_HMM_dom_filter_attributes = list(zip(*source_domain_filter_values)) + + self.db.set_meta_value('HMM_dom_filter_sources', ','.join(str(s) for s in updated_HMM_dom_filter_attributes[0])) + self.db.set_meta_value('HMM_dom_filter_target_coverage', ','.join(str(s) for s in updated_HMM_dom_filter_attributes[1])) + self.db.set_meta_value('HMM_dom_filter_query_coverage', ','.join(str(s) for s in updated_HMM_dom_filter_attributes[2])) + self.db.disconnect() From ecfb79c3ffaf08697d47987babe9dbedbc46e0e4 Mon Sep 17 00:00:00 2001 From: mschechter Date: Wed, 2 Feb 2022 15:54:33 -0800 Subject: [PATCH 06/27] no needed --- sandbox/anvi-script-filter-hmm-hits-table | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index bf4b001fc0..4ef6d449f6 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -232,12 +232,6 @@ class FilterHmmHitsTable(object): hmm_tables.append_to_hmm_hits_table(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict) - # Add meta value to show which HMM_source was filtered - # Self attributes that need to be added: - # - HMM_dom_filter_sources - # - HMM_dom_filter_target_coverage - # - HMM_dom_filter_query_coverage - self.db = db.DB(self.contigs_db_path, anvio.__contigs__version__, new_database=False) HMM_dom_filter_sources = self.db.get_meta_value('HMM_dom_filter_sources', return_none_if_not_in_table=True) From f908ea096ed1dcf6c83ad4e01f23330bacc8f29d Mon Sep 17 00:00:00 2001 From: mschechter Date: Wed, 2 Feb 2022 15:55:07 -0800 Subject: [PATCH 07/27] updated EcoPhylo to use new contigsDB self attributes --- anvio/workflows/ecophylo/Snakefile | 56 +++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/anvio/workflows/ecophylo/Snakefile b/anvio/workflows/ecophylo/Snakefile index 813898f51d..79d0f67bf9 100644 --- a/anvio/workflows/ecophylo/Snakefile +++ b/anvio/workflows/ecophylo/Snakefile @@ -121,34 +121,58 @@ rule filter_hmm_hits_by_query_coverage: # Check if anvi-run-scg-taxonomy and/or anvi-script-filter-hmm-hits-table has been run already contigs_db = ContigsDatabase(contigsDB) try: - HMM_source_domain_table_was_filtered = contigs_db.meta['HMM_hits_domain_table_was_filtered'] - HMM_source_domain_table_was_filtered_list = HMM_source_domain_table_was_filtered.split(",") + HMM_dom_filter_sources = contigs_db.meta['HMM_dom_filter_sources'] + HMM_dom_filter_target_coverage = contigs_db.meta['HMM_dom_filter_target_coverage'] + HMM_dom_filter_query_coverage = contigs_db.meta['HMM_dom_filter_query_coverage'] + + HMM_dom_filter_sources_list = HMM_dom_filter_sources.split(",") + HMM_dom_filter_target_coverage_list = HMM_dom_filter_target_coverage.split(",") + HMM_dom_filter_query_coverage_list = HMM_dom_filter_query_coverage.split(",") + + source_domain_filter_values = list(zip(HMM_dom_filter_sources_list, HMM_dom_filter_target_coverage_list, HMM_dom_filter_query_coverage_list)) + + domain_filter_values_dict = {} + for item in source_domain_filter_values: + domain_filter_values_dict[item[0]] = item except: - HMM_source_domain_table_was_filtered_list = [] + domain_filter_values_dict = {} contigs_db.disconnect() if HMM_source in M.internal_HMM_sources: - if HMM_source in HMM_source_domain_table_was_filtered_list: - print(f"The HMM source {HMM_source} has already been filtered, skipping filter_hmm_hits_by_query_coverage!") - pass + if HMM_source in domain_filter_values_dict.keys(): + if float(params.query_coverage) > float(domain_filter_values_dict[HMM_source][2]): + print(params.query_coverage) + print(domain_filter_values_dict[HMM_source][2]) + print("shit") + shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ + --domain-hits-table {domtblout} \ + --hmm-source {HMM_source} \ + --query-coverage {params.query_coverage} \ + {params.additional_params} 2> {log}") + else: + print(f"The HMM source {HMM_source} has already been filtered with more stringent query coverage value: {domain_filter_values_dict[HMM_source][2]}, skipping filter_hmm_hits_by_query_coverage!") + pass else: + print("fuck") shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ --domain-hits-table {domtblout} \ --hmm-source {HMM_source} \ --query-coverage {params.query_coverage} \ {params.additional_params} 2> {log}") else: - if HMM_source in HMM_source_domain_table_was_filtered_list: - print(f"The HMM source {HMM_source} has already been filtered, skipping filter_hmm_hits_by_query_coverage!") - pass - else: - shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ - --domain-hits-table {domtblout} \ - --hmm-profile-dir {HMM_dir} \ - --hmm-source {HMM_source} \ - --query-coverage {params.query_coverage} \ - {params.additional_params} 2> {log}") + if HMM_source in domain_filter_values_dict.keys(): + if float(params.query_coverage) > float(domain_filter_values_dict[HMM_source][2]): + print("fuckkkkkk") + shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ + --domain-hits-table {domtblout} \ + --hmm-profile-dir {HMM_dir} \ + --hmm-source {HMM_source} \ + --query-coverage {params.query_coverage} \ + {params.additional_params} 2> {log}") + else: + print(f"The HMM source {HMM_source} has already been filtered with more stringent query coverage value: {domain_filter_values_dict[HMM_source][2]}, skipping filter_hmm_hits_by_query_coverage!") + pass shell('touch {output.done}') From 0a29d6de5ed21fe64eb6a43a8f9a1258bc9ddc62 Mon Sep 17 00:00:00 2001 From: mschechter Date: Wed, 2 Feb 2022 15:55:30 -0800 Subject: [PATCH 08/27] fix --- sandbox/anvi-script-filter-hmm-hits-table | 1 - 1 file changed, 1 deletion(-) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 4ef6d449f6..79a8a07444 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -213,7 +213,6 @@ class FilterHmmHitsTable(object): hmm_tables.remove_source(self.hmm_source) # Re-write hmm_hits table to contigsDB - internal_sources = list(anvio.data.hmm.sources.keys()) source = self.hmm_source From a14c9992d50d36e580aca31c6e2fdc8f28fa893e Mon Sep 17 00:00:00 2001 From: mschechter Date: Wed, 2 Feb 2022 16:03:03 -0800 Subject: [PATCH 09/27] more clear --- sandbox/anvi-script-filter-hmm-hits-table | 1 + 1 file changed, 1 insertion(+) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 79a8a07444..4e2f0615c8 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -231,6 +231,7 @@ class FilterHmmHitsTable(object): hmm_tables.append_to_hmm_hits_table(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict) + # add contigsDB self attributes for HMM_source, target_coverage, and query_coverage self.db = db.DB(self.contigs_db_path, anvio.__contigs__version__, new_database=False) HMM_dom_filter_sources = self.db.get_meta_value('HMM_dom_filter_sources', return_none_if_not_in_table=True) From 752b13d5f70d64df157da7fe49e0b66772a6ddf3 Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Thu, 3 Feb 2022 08:52:27 +0100 Subject: [PATCH 10/27] remove whitespace & unused or redefined libraries --- sandbox/anvi-script-filter-hmm-hits-table | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 4e2f0615c8..3dbc1b5513 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -8,7 +8,6 @@ is done using query and/or target coverage import sys import os -import numpy as np import pandas as pd import anvio @@ -16,13 +15,11 @@ import anvio.data.hmm import anvio.db as db import anvio.utils as utils import anvio.hmmops as hmmops -import anvio.bamops as bamops import anvio.terminal as terminal import anvio.filesnpaths as filesnpaths from anvio.dbops import ContigsDatabase from anvio.parsers import parser_modules -from anvio.argparse import ArgumentParser from anvio.tables.hmmhits import TablesForHMMHits from anvio.errors import ConfigError, FilesNPathsError @@ -269,7 +266,7 @@ class FilterHmmHitsTable(object): if new_filtering_parameters[0] not in HMM_dom_filter_sources_list: source_domain_filter_values.append(new_filtering_parameters) - + updated_HMM_dom_filter_attributes = list(zip(*source_domain_filter_values)) self.db.set_meta_value('HMM_dom_filter_sources', ','.join(str(s) for s in updated_HMM_dom_filter_attributes[0])) From bd0ba68aa870aacc762845b7b20a254997c85c52 Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Thu, 3 Feb 2022 08:54:37 +0100 Subject: [PATCH 11/27] suppress output messages (plus we never import anvio libs like that) --- sandbox/anvi-script-filter-hmm-hits-table | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 3dbc1b5513..94b3f21405 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -11,13 +11,15 @@ import os import pandas as pd import anvio -import anvio.data.hmm import anvio.db as db import anvio.utils as utils import anvio.hmmops as hmmops import anvio.terminal as terminal import anvio.filesnpaths as filesnpaths +with terminal.SuppressAllOutput(): + import anvio.data.hmm as hmm_data + from anvio.dbops import ContigsDatabase from anvio.parsers import parser_modules from anvio.tables.hmmhits import TablesForHMMHits @@ -61,7 +63,7 @@ class FilterHmmHitsTable(object): if self.hmm_profile_dir: self.sources = utils.get_HMM_sources_dictionary([args.hmm_profile_dir]) else: - self.sources = anvio.data.hmm.sources + self.sources = hmm_data.sources def sanity_checks(self): """Sanity checks for program inputs.""" @@ -210,7 +212,7 @@ class FilterHmmHitsTable(object): hmm_tables.remove_source(self.hmm_source) # Re-write hmm_hits table to contigsDB - internal_sources = list(anvio.data.hmm.sources.keys()) + internal_sources = list(hmm_data.sources.keys()) source = self.hmm_source if self.sources not in internal_sources: @@ -219,7 +221,7 @@ class FilterHmmHitsTable(object): all_genes_searched_against = self.sources[source]['genes'] reference = self.sources[source]['ref'] else: - sources = anvio.data.hmm.sources + sources = hmm_data.sources source = self.hmm_source kind_of_search = sources[source]['kind'] domain = sources[source]['domain'] From ae346be6a1a9528bd2177608b7d37939b6a6a4b0 Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Thu, 3 Feb 2022 08:57:13 +0100 Subject: [PATCH 12/27] the artifact name is contigs-db, not contigsDB :) --- sandbox/anvi-script-filter-hmm-hits-table | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 94b3f21405..be82151f25 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -80,8 +80,8 @@ class FilterHmmHitsTable(object): info_table = hmmops.SequencesForHMMHits(self.contigs_db_path).hmm_hits_info if self.hmm_source not in info_table: - raise ConfigError(f"Whoa there, the HMM source you provided, '{self.hmm_source}', is not in your contigsDB: " - f"{self.contigs_db_path}. Maybe you misspelled it? Maybe you never added it to your contigsDB??" + raise ConfigError(f"Whoa there, the HMM source you provided, '{self.hmm_source}', is not in your contigs-db: " + f"{self.contigs_db_path}. Maybe you misspelled it? Maybe you never added it to your contigs-db??" f"Please use --list-hmm-sources to see which HMM sources you have available. If you don't see the HMMs you " f"need then try re-running anvi-run-hmms and make sure to specify your HMM source of interest.") @@ -205,13 +205,13 @@ class FilterHmmHitsTable(object): def append_search_results_dict_to_hmm_tables(self, search_results_dict=None): - """Put in the new filtered hmm_hits table to the contigsDB""" + """Put in the new filtered hmm_hits table to the contigs-db""" # Remove old hmm_hits contigs_db_path hmm_tables = TablesForHMMHits(self.contigs_db_path) hmm_tables.remove_source(self.hmm_source) - # Re-write hmm_hits table to contigsDB + # Re-write hmm_hits table to contigs-db internal_sources = list(hmm_data.sources.keys()) source = self.hmm_source @@ -230,7 +230,7 @@ class FilterHmmHitsTable(object): hmm_tables.append_to_hmm_hits_table(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict) - # add contigsDB self attributes for HMM_source, target_coverage, and query_coverage + # add contigs-db self attributes for HMM_source, target_coverage, and query_coverage self.db = db.DB(self.contigs_db_path, anvio.__contigs__version__, new_database=False) HMM_dom_filter_sources = self.db.get_meta_value('HMM_dom_filter_sources', return_none_if_not_in_table=True) From 2f7dedf66786799149ec7f6b06896689aef3a88c Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Thu, 3 Feb 2022 08:57:30 +0100 Subject: [PATCH 13/27] cosmetics --- sandbox/anvi-script-filter-hmm-hits-table | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index be82151f25..9c210ea140 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -75,7 +75,7 @@ class FilterHmmHitsTable(object): self.run.info("Domtblout Path", self.domtblout) if not self.hmm_source: - raise ConfigError("Please provide a hmm-source :)") + raise ConfigError("Please provide an hmm-source :)") info_table = hmmops.SequencesForHMMHits(self.contigs_db_path).hmm_hits_info @@ -155,7 +155,6 @@ class FilterHmmHitsTable(object): header=None, index_col=False) except Exception as e: - print(e) raise ConfigError(f"Doesn't look like a --domtblout... anvi'o can't even... " f"Please look at this error message to find out what happened: " f"{e}") From 21e4cef459c56465edb1d943641e1cb2ca0a7f3a Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 7 Feb 2022 08:41:10 -0800 Subject: [PATCH 14/27] should not load contigsDB before checking domtable path --- bin/anvi-run-hmms | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/anvi-run-hmms b/bin/anvi-run-hmms index d46491c872..5a577725f8 100755 --- a/bin/anvi-run-hmms +++ b/bin/anvi-run-hmms @@ -7,14 +7,15 @@ import sys import anvio import anvio.utils as utils import anvio.terminal as terminal +import anvio.filesnpaths as filesnpaths with terminal.SuppressAllOutput(): import anvio.data.hmm as hmm_data available_hmm_sources = list(hmm_data.sources.keys()) -from anvio.errors import ConfigError, FilesNPathsError from anvio.terminal import time_program +from anvio.errors import ConfigError, FilesNPathsError from anvio.tables.hmmhits import TablesForHMMHits from anvio.tables.trnahits import TablesForTransferRNAs @@ -91,6 +92,12 @@ def main(args): "this output in a directory with --hmmer-output-dir. There is no point to requesting this output " "if you are never going to see it, so we figured we'd stop you right there. :)") + domtable_path = os.path.join(args.hmmer_output_dir + "/hmm.domtable") + if filesnpaths.is_file_exists(domtable_path, dont_raise=True): + raise ConfigError(f"The file {file_path} already exists, and anvi'o does not like to " + "to overwrite things. Please either remove the file or rename your " + "desired output.") + search_tables = TablesForHMMHits(args.contigs_db, num_threads_to_use=args.num_threads, just_do_it=args.just_do_it, hmm_program_to_use=args.hmmer_program, hmmer_output_directory=args.hmmer_output_dir, get_domain_table_output=args.domain_hits_table, add_to_functions_table=args.add_to_functions_table) From c04470f4f2cd73e474bf19884de1230ba820083c Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 7 Feb 2022 08:59:25 -0800 Subject: [PATCH 15/27] fix --- bin/anvi-run-hmms | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/anvi-run-hmms b/bin/anvi-run-hmms index 5a577725f8..bfe3b03ac3 100755 --- a/bin/anvi-run-hmms +++ b/bin/anvi-run-hmms @@ -92,9 +92,9 @@ def main(args): "this output in a directory with --hmmer-output-dir. There is no point to requesting this output " "if you are never going to see it, so we figured we'd stop you right there. :)") - domtable_path = os.path.join(args.hmmer_output_dir + "/hmm.domtable") + domtable_path = os.path.join(args.hmmer_output_dir + "/hmm.table") if filesnpaths.is_file_exists(domtable_path, dont_raise=True): - raise ConfigError(f"The file {file_path} already exists, and anvi'o does not like to " + raise ConfigError(f"The file {domtable_path} already exists, and anvi'o does not like to " "to overwrite things. Please either remove the file or rename your " "desired output.") From eed37146097a2e916873b96ff4dcdcdbd59563ec Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 7 Feb 2022 09:25:26 -0800 Subject: [PATCH 16/27] warning to use hmmsearch with --domtblout if planning on using anvi-script-filter-hmm-hits-table --- bin/anvi-run-hmms | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bin/anvi-run-hmms b/bin/anvi-run-hmms index bfe3b03ac3..2805cdf6d2 100755 --- a/bin/anvi-run-hmms +++ b/bin/anvi-run-hmms @@ -91,6 +91,11 @@ def main(args): raise ConfigError("We can see that you have requested --domain-hits-table but you haven't asked us to store " "this output in a directory with --hmmer-output-dir. There is no point to requesting this output " "if you are never going to see it, so we figured we'd stop you right there. :)") + + if args.domain_hits_table and args.hmmer_program != "hmmsearch": + run.warning(f"You requested to save the --domtblout without using hmmsearch. We wanted to kindly warn you that if you " + f"that if you plan on using anvi-script-filter-hmm-hits-table that anvi'o needs the --domtblout from hmmsearch " + f"instead of hmmscan for example. You can switch to hmmsearch by using the --hmmer-program parameter :)") domtable_path = os.path.join(args.hmmer_output_dir + "/hmm.table") if filesnpaths.is_file_exists(domtable_path, dont_raise=True): From 3375156b569366f7a2f5a740381fe4d978ff52b4 Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 7 Feb 2022 09:54:40 -0800 Subject: [PATCH 17/27] sanity checks and better description of target and query coverage --- sandbox/anvi-script-filter-hmm-hits-table | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 9c210ea140..8f91330d4e 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -34,7 +34,7 @@ __authors__ = ['mschecht'] __provides__ = ["hmm-hits"] __requires__ = ["contigs-db","hmm-source", "hmm-hits"] __description__ = ("Filter weak HMM hits from a given contigs database using a domain hits table " - "reported by `anvi-run-hmms`.") + "reported by hmmsearch in `anvi-run-hmms`.") pp = terminal.pretty_print @@ -91,6 +91,12 @@ class FilterHmmHitsTable(object): raise ConfigError(f"The hmm-source {self.hmm_source} is not for amino acid sequences. " f"anvi-script-filter-hmm-hit-table currently can only work with hmm-sources " f"from protein sequences.") + + if 0.0 < self.target_coverage > 1.0: + raise ConfigError(f"Target coverage needs to be between 0 and 1") + + if 0.0 < self.query_coverage > 1.0: + raise ConfigError(f"Query coverage needs to be between 0 and 1") def process(self): @@ -292,10 +298,12 @@ if __name__ == '__main__': parser.add_argument(*anvio.A('list-hmm-sources'), **anvio.K('list-hmm-sources')) parser.add_argument(*anvio.A('hmm-profile-dir'), **anvio.K('hmm-profile-dir')) parser.add_argument('--domain-hits-table', metavar='PATH', help="Please provide the path to the domain-table-output. You can get this file from running anvi-run-hmms with the flag --domain-hits-table.") - parser.add_argument('--target-coverage', - help=" (ali_coord_to - ali_coord_from)/target_length") - parser.add_argument('--query-coverage', - help=" (hmm_coord_to - hmm_coord_from)/hmm_length") + parser.add_argument('--target-coverage', type=float, + help="The percent length (0.0-1.0) of the query sequence that must be aligned to the HMM model. " + "Here's the formula using columns from --domain-hits-table: (ali_coord_to - ali_coord_from)/target_length") + parser.add_argument('--query-coverage', type=float, + help="The percent length (0.0-1.0) of the target HMM model that is aligned query genes in the contigsDB. " + "Here's the formula using columns from --domain-hits-table: (hmm_coord_to - hmm_coord_from)/hmm_length") args, unknown = parser.parse_known_args() From 8e2c02c9e8d45241dc9f6a9d79e53ce30a6007f2 Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 7 Feb 2022 14:13:50 -0800 Subject: [PATCH 18/27] sanity check to see if domtblout came from same HMM_source being filtered --- sandbox/anvi-script-filter-hmm-hits-table | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 8f91330d4e..6976cdb6f5 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -65,6 +65,7 @@ class FilterHmmHitsTable(object): else: self.sources = hmm_data.sources + def sanity_checks(self): """Sanity checks for program inputs.""" @@ -168,6 +169,7 @@ class FilterHmmHitsTable(object): self.df['query_coverage'] = ((self.df['hmm_stop'] - self.df['hmm_start'])/ self.df['hmm_length']) self.df['target_coverage'] = ((self.df['gene_stop'] - self.df['gene_start'])/ self.df['gene_length']) + self.hmm_names_set = set(self.df.hmm_name.to_list()) def filter_domtblout(self): """Filter the hmm_hits table based on query and/or target coverage""" @@ -212,6 +214,22 @@ class FilterHmmHitsTable(object): def append_search_results_dict_to_hmm_tables(self, search_results_dict=None): """Put in the new filtered hmm_hits table to the contigs-db""" + contigs_db = ContigsDatabase(self.contigs_db_path) + hmm_hits_dict = contigs_db.db.get_table_as_dict("hmm_hits") + + gene_list = [] + for key,value in hmm_hits_dict.items(): + if value['source'] == self.hmm_source: + gene_list.append(value['gene_name']) + + gene_set = set(gene_list) + + if not gene_set.issubset(self.hmm_names_set): + raise ConfigError(f"The genes in {self.domtblout} don't seem to be in the hmm_hits table " + f"from your contigsDB: {self.contigs_db_path}. " + f"Please double check you are filtering with the same HMM_source that you used " + f"with anvi-run-hmms") + # Remove old hmm_hits contigs_db_path hmm_tables = TablesForHMMHits(self.contigs_db_path) hmm_tables.remove_source(self.hmm_source) From 99ce247c8e79c449c2753fac0c0af5969aceb02f Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 7 Feb 2022 14:18:40 -0800 Subject: [PATCH 19/27] clear error message --- sandbox/anvi-script-filter-hmm-hits-table | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 6976cdb6f5..7bc3ebd6e2 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -228,7 +228,7 @@ class FilterHmmHitsTable(object): raise ConfigError(f"The genes in {self.domtblout} don't seem to be in the hmm_hits table " f"from your contigsDB: {self.contigs_db_path}. " f"Please double check you are filtering with the same HMM_source that you used " - f"with anvi-run-hmms") + f"to create {self.domtblout} when you ran anvi-run-hmms.") # Remove old hmm_hits contigs_db_path hmm_tables = TablesForHMMHits(self.contigs_db_path) From 2f69ff9689cbb045593ebc9ec5fa3d4c859fe7d4 Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 7 Feb 2022 14:25:24 -0800 Subject: [PATCH 20/27] fix --- sandbox/anvi-script-filter-hmm-hits-table | 1 + 1 file changed, 1 insertion(+) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 7bc3ebd6e2..1f102cb6ac 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -216,6 +216,7 @@ class FilterHmmHitsTable(object): contigs_db = ContigsDatabase(self.contigs_db_path) hmm_hits_dict = contigs_db.db.get_table_as_dict("hmm_hits") + contigs_db.disconnect() gene_list = [] for key,value in hmm_hits_dict.items(): From 0c4bbf262b03a1706e60b2db4bbc337992b10904 Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 7 Feb 2022 14:58:32 -0800 Subject: [PATCH 21/27] user may only provide one filtering parameter --- anvio/workflows/ecophylo/Snakefile | 5 ----- sandbox/anvi-script-filter-hmm-hits-table | 10 ++++++---- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/anvio/workflows/ecophylo/Snakefile b/anvio/workflows/ecophylo/Snakefile index 79d0f67bf9..8617c27884 100644 --- a/anvio/workflows/ecophylo/Snakefile +++ b/anvio/workflows/ecophylo/Snakefile @@ -142,9 +142,6 @@ rule filter_hmm_hits_by_query_coverage: if HMM_source in M.internal_HMM_sources: if HMM_source in domain_filter_values_dict.keys(): if float(params.query_coverage) > float(domain_filter_values_dict[HMM_source][2]): - print(params.query_coverage) - print(domain_filter_values_dict[HMM_source][2]) - print("shit") shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ --domain-hits-table {domtblout} \ --hmm-source {HMM_source} \ @@ -154,7 +151,6 @@ rule filter_hmm_hits_by_query_coverage: print(f"The HMM source {HMM_source} has already been filtered with more stringent query coverage value: {domain_filter_values_dict[HMM_source][2]}, skipping filter_hmm_hits_by_query_coverage!") pass else: - print("fuck") shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ --domain-hits-table {domtblout} \ --hmm-source {HMM_source} \ @@ -163,7 +159,6 @@ rule filter_hmm_hits_by_query_coverage: else: if HMM_source in domain_filter_values_dict.keys(): if float(params.query_coverage) > float(domain_filter_values_dict[HMM_source][2]): - print("fuckkkkkk") shell(f"anvi-script-filter-hmm-hits-table -c {contigsDB} \ --domain-hits-table {domtblout} \ --hmm-profile-dir {HMM_dir} \ diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 1f102cb6ac..98d5c47a4c 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -93,11 +93,13 @@ class FilterHmmHitsTable(object): f"anvi-script-filter-hmm-hit-table currently can only work with hmm-sources " f"from protein sequences.") - if 0.0 < self.target_coverage > 1.0: - raise ConfigError(f"Target coverage needs to be between 0 and 1") + if self.target_coverage: + if 0.0 < float(self.target_coverage) > 1.0: + raise ConfigError(f"Target coverage needs to be between 0 and 1") - if 0.0 < self.query_coverage > 1.0: - raise ConfigError(f"Query coverage needs to be between 0 and 1") + if self.query_coverage: + if 0.0 < float(self.query_coverage) > 1.0: + raise ConfigError(f"Query coverage needs to be between 0 and 1") def process(self): From 8052c1418cd9ed08104502a6e6f75f4f1ea7b5f4 Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Tue, 15 Feb 2022 16:50:48 +0100 Subject: [PATCH 22/27] better warning --- bin/anvi-run-hmms | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/anvi-run-hmms b/bin/anvi-run-hmms index 2805cdf6d2..384259b65a 100755 --- a/bin/anvi-run-hmms +++ b/bin/anvi-run-hmms @@ -93,9 +93,10 @@ def main(args): "if you are never going to see it, so we figured we'd stop you right there. :)") if args.domain_hits_table and args.hmmer_program != "hmmsearch": - run.warning(f"You requested to save the --domtblout without using hmmsearch. We wanted to kindly warn you that if you " - f"that if you plan on using anvi-script-filter-hmm-hits-table that anvi'o needs the --domtblout from hmmsearch " - f"instead of hmmscan for example. You can switch to hmmsearch by using the --hmmer-program parameter :)") + run.warning("You requested to save the --domtblout without using hmmsearch. We wanted to kindly warn you that if you " + "that if you plan on using `anvi-script-filter-hmm-hits-table` later to remove weak hits, you must use " + "`hmmsearch` instead of `hmmscan`. You can instruct anvi'o to switch to `hmmsearch` by including the " + "parameter `--hmmer-program hmmsearch` to your `anvi-run-hmms` command.") domtable_path = os.path.join(args.hmmer_output_dir + "/hmm.table") if filesnpaths.is_file_exists(domtable_path, dont_raise=True): From 3bb5a9eb39881272dc7a43fe2c4414079a6636c8 Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Tue, 15 Feb 2022 16:51:19 +0100 Subject: [PATCH 23/27] remove trailing whitespaces, contigsDBs, and unnecessary f-string notations --- sandbox/anvi-script-filter-hmm-hits-table | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 98d5c47a4c..61433503f9 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -92,14 +92,14 @@ class FilterHmmHitsTable(object): raise ConfigError(f"The hmm-source {self.hmm_source} is not for amino acid sequences. " f"anvi-script-filter-hmm-hit-table currently can only work with hmm-sources " f"from protein sequences.") - + if self.target_coverage: if 0.0 < float(self.target_coverage) > 1.0: - raise ConfigError(f"Target coverage needs to be between 0 and 1") + raise ConfigError("Target coverage needs to be between 0 and 1") if self.query_coverage: if 0.0 < float(self.query_coverage) > 1.0: - raise ConfigError(f"Query coverage needs to be between 0 and 1") + raise ConfigError("Query coverage needs to be between 0 and 1") def process(self): @@ -224,12 +224,12 @@ class FilterHmmHitsTable(object): for key,value in hmm_hits_dict.items(): if value['source'] == self.hmm_source: gene_list.append(value['gene_name']) - + gene_set = set(gene_list) if not gene_set.issubset(self.hmm_names_set): raise ConfigError(f"The genes in {self.domtblout} don't seem to be in the hmm_hits table " - f"from your contigsDB: {self.contigs_db_path}. " + f"from your contigs-db: {self.contigs_db_path}. " f"Please double check you are filtering with the same HMM_source that you used " f"to create {self.domtblout} when you ran anvi-run-hmms.") @@ -256,7 +256,7 @@ class FilterHmmHitsTable(object): hmm_tables.append_to_hmm_hits_table(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict) - # add contigs-db self attributes for HMM_source, target_coverage, and query_coverage + # add contigs-db self attributes for HMM_source, target_coverage, and query_coverage self.db = db.DB(self.contigs_db_path, anvio.__contigs__version__, new_database=False) HMM_dom_filter_sources = self.db.get_meta_value('HMM_dom_filter_sources', return_none_if_not_in_table=True) @@ -323,7 +323,7 @@ if __name__ == '__main__': help="The percent length (0.0-1.0) of the query sequence that must be aligned to the HMM model. " "Here's the formula using columns from --domain-hits-table: (ali_coord_to - ali_coord_from)/target_length") parser.add_argument('--query-coverage', type=float, - help="The percent length (0.0-1.0) of the target HMM model that is aligned query genes in the contigsDB. " + help="The percent length (0.0-1.0) of the target HMM model that is aligned query genes in the contigs-db. " "Here's the formula using columns from --domain-hits-table: (hmm_coord_to - hmm_coord_from)/hmm_length") args, unknown = parser.parse_known_args() From 3f5ea746620e478c2afcbeff976ceb70d52fc2ca Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Tue, 15 Feb 2022 16:51:44 +0100 Subject: [PATCH 24/27] make sure they are gene calls --- sandbox/anvi-script-filter-hmm-hits-table | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index 61433503f9..b077e6b429 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -152,6 +152,15 @@ class FilterHmmHitsTable(object): ('description', str), # description of target ] + try: + [int(l.split()[0]) for l in open(self.domtblout)][1:] + except ValueError as e: + raise ConfigError(f"The data in the first column of your DOM table output ('{self.domtblout}') " + f"do not look like anv'o gene caller ids (we know that because Python complained " + f"saying '{e}', which should never be the case with proper anvi'o gene caller ids). " + f"This outcome is only possible if you run `anvi-run-hmms` without the parameter " + f"`--hmmer-program hmmsearch`. No filters for you.") + try: colnames_coltypes_list = list(zip(*col_info)) colnames_coltypes_dict = dict(zip(colnames_coltypes_list[0], colnames_coltypes_list[1])) From f8d9b63b2c3e866ab4347d1cafefd64dd92ed18e Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Tue, 15 Feb 2022 16:54:42 +0100 Subject: [PATCH 25/27] unused --- anvio/parsers/hmmer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anvio/parsers/hmmer.py b/anvio/parsers/hmmer.py index 7dbd0828a9..8bf681a41d 100644 --- a/anvio/parsers/hmmer.py +++ b/anvio/parsers/hmmer.py @@ -233,7 +233,6 @@ def find_line(self, condition): def read_lines_until(self, condition, include_last=False, store=True): lines = [] - return_value = lines if store else True for line in self.query_lines[self.line_no:]: self.line_no += 1 From 6f6855a4015d85d488ffa216a49e0c39df74850b Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Tue, 15 Feb 2022 18:57:09 +0100 Subject: [PATCH 26/27] parameterize `no_header` variable --- anvio/parsers/hmmer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/parsers/hmmer.py b/anvio/parsers/hmmer.py index 8bf681a41d..17cde1737e 100644 --- a/anvio/parsers/hmmer.py +++ b/anvio/parsers/hmmer.py @@ -534,7 +534,7 @@ class HMMERTableOutput(Parser): Which HMMER program was used to generate the output we are parsing? Pick from {'hmmscan', 'hmmsearch'} """ - def __init__(self, hmmer_table_txt, alphabet='AA', context='GENE', program='hmmscan', run=terminal.Run()): + def __init__(self, hmmer_table_txt, alphabet='AA', context='GENE', program='hmmscan', no_header=True, run=terminal.Run()): self.alphabet = alphabet self.context = context self.program = program From 37c92d14d02d706e64228a394dfc90deee6a0227 Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Tue, 15 Feb 2022 18:57:50 +0100 Subject: [PATCH 27/27] the DOM table we filter here HAS a header. --- anvio/parsers/hmmer.py | 2 +- sandbox/anvi-script-filter-hmm-hits-table | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/parsers/hmmer.py b/anvio/parsers/hmmer.py index 17cde1737e..db37e72663 100644 --- a/anvio/parsers/hmmer.py +++ b/anvio/parsers/hmmer.py @@ -562,7 +562,7 @@ def __init__(self, hmmer_table_txt, alphabet='AA', context='GENE', program='hmms 'col_names': col_names, 'col_mapping': col_mapping, 'indexing_field': -1, - 'no_header': True, + 'no_header': no_header, }, } diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index b077e6b429..cf4e1c0d8d 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -216,7 +216,7 @@ class FilterHmmHitsTable(object): context= 'DOMAIN' hmm_program = 'hmmsearch' - parser = parser_modules['search']['hmmer_table_output'](hmmsearch_tbl, alphabet=alphabet, context=context, program=hmm_program) + parser = parser_modules['search']['hmmer_table_output'](hmmsearch_tbl, alphabet=alphabet, context=context, program=hmm_program, no_header=False) search_results_dict = parser.get_search_results() return search_results_dict