From e279d43dc867294c31962e91240ae0f19b4220c9 Mon Sep 17 00:00:00 2001 From: Silvio Waschina Date: Wed, 10 Jan 2024 15:18:31 +0100 Subject: [PATCH 01/37] Correction of typos --- atlas/atlas.py | 2 +- atlas/init/create_sample_table.py | 2 +- atlas/init/parse_sra.py | 6 +-- docs/usage/configuration.rst | 12 ++--- docs/usage/getting_started.rst | 46 +++++++++---------- workflow/report/template_QC_report.html | 12 ++--- workflow/report/template_assembly_report.html | 4 +- workflow/report/template_bin_report.html | 10 ++-- workflow/rules/binning.smk | 2 +- workflow/scripts/parse_vamb.py | 2 +- workflow/scripts/utils/io.py | 2 +- 11 files changed, 50 insertions(+), 50 deletions(-) diff --git a/atlas/atlas.py b/atlas/atlas.py index 3e7ee05c..330d1487 100644 --- a/atlas/atlas.py +++ b/atlas/atlas.py @@ -121,7 +121,7 @@ def get_snakefile(file="workflow/Snakefile"): type=int, default=multiprocessing.cpu_count(), show_default=True, - help="use at most this many jobs in parallel (see cluster submission for mor details).", + help="use at most this many jobs in parallel (see cluster submission for more details).", ) @click.option( "--max-mem", diff --git a/atlas/init/create_sample_table.py b/atlas/init/create_sample_table.py index 8c9ee2f2..734c6cab 100644 --- a/atlas/init/create_sample_table.py +++ b/atlas/init/create_sample_table.py @@ -158,7 +158,7 @@ def get_samples_from_fastq(path, fraction_split_character=split_character): # parse subfolder if len(subfolders) > 0: logger.info( - f"Found {len(subfolders)} subfolders. Check if I find fastq files inside. Use the the subfolder as sample_names " + f"Found {len(subfolders)} subfolders. Check if I find fastq files inside. Use the subfolder as sample_names " ) for subf in subfolders: diff --git a/atlas/init/parse_sra.py b/atlas/init/parse_sra.py index 953c03b6..0c2c41ff 100644 --- a/atlas/init/parse_sra.py +++ b/atlas/init/parse_sra.py @@ -67,7 +67,7 @@ def filter_runinfo(RunTable, ignore_paired=False): if Difference > 0: logger.info( - f"Runs have the folowing values for {key}: {', '.join(All_values)}\n" + f"Runs have the following values for {key}: {', '.join(All_values)}\n" f"Select only runs {key} == {Expected_library_values[key]}, " f"Filtered out {Difference} runs" ) @@ -77,7 +77,7 @@ def filter_runinfo(RunTable, ignore_paired=False): All_values = RunTable[key].unique() if any(RunTable[key] != Expected_library_values[key]): logger.warning( - f"Runs have the folowing values for {key}: {', '.join(All_values)}\n" + f"Runs have the following values for {key}: {', '.join(All_values)}\n" f"Usually I expect {key} == {Expected_library_values[key]} " ) @@ -141,7 +141,7 @@ def validate_merging_runinfo(path): logger.error( f"You attemt to merge runs from the same sample. " f"But for {len(problematic_samples)} samples the runs are sequenced with different platforms and should't be merged.\n" - f"Please resolve the the abiguity in the table {path} and rerun the command.\n" + f"Please resolve the abiguity in the table {path} and rerun the command.\n" ) exit(1) diff --git a/docs/usage/configuration.rst b/docs/usage/configuration.rst index 836a3456..0dc88b10 100644 --- a/docs/usage/configuration.rst +++ b/docs/usage/configuration.rst @@ -11,10 +11,10 @@ _contaminants: Remove reads from Host ====================== -One of the most important steps in the Quality control is to remove host genome. +One of the most important steps in the Quality control is to remove reads from the host's genome. You can add any number of genomes to be removed. 
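A minimal sketch of the corresponding config entry (the ``contaminant_references`` key is the one read by the QC rules quoted later in this patch series; the ``host`` label and the paths are placeholders for your own genomes)::

    contaminant_references:
      PhiX: /path/to/phiX174_virus.fa
      host: /path/to/masked_host_genome.fa

Reads that map to any of these references are removed during quality control.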
-We recommend you to use genomes where repetitive sequences are masked. +We recommend using genomes where repetitive sequences are masked. See here for more details `human genome `_. @@ -36,7 +36,7 @@ There are two primary strategies for co-abundance binning: The samples to be binned together are specified using the `BinGroup` in the `sample.tsv` file. The size of the BinGroup should be selected based on the binner and the co-binning strategy in use. -Cross mapping complexity scales quadratically with the size of the BinGroup since each sample's reads are mapped to each other. +Cross-mapping complexity scales quadratically with the size of the BinGroup since each sample's reads are mapped to each other. This might yield better results for complex metagenomes, although no definitive benchmark is known. On the other hand, co-binning is more efficient, as it maps a sample's reads only once to a potentially large assembly. @@ -88,12 +88,12 @@ Long reads ========== Limitation: Hybrid assembly of long and short reads is supported with spades and metaSpades. -However metaSpades needs a paired-end short-read library. +However, metaSpades needs a paired-end short-read library. The path of the (preprocessed) long reads should be added manually to the -the sample table under a new column heading 'longreads'. +sample table under a new column heading 'longreads'. -In addition the type of the long reads should be defined in the config file: +In addition, the type of the long reads should be defined in the config file: ``longread_type`` one of ["pacbio", "nanopore", "sanger", "trusted-contigs", "untrusted-contigs"] diff --git a/docs/usage/getting_started.rst b/docs/usage/getting_started.rst index d8fbd56f..3c53f42b 100644 --- a/docs/usage/getting_started.rst +++ b/docs/usage/getting_started.rst @@ -11,7 +11,7 @@ Conda package manager --------------------- Atlas has **one dependency**: conda_. All databases and other dependencies are installed **on the fly**. -Atlas is based on snakemake which allows to run steps of the workflow in parallel on a cluster. +Atlas is based on snakemake, which allows to run steps of the workflow in parallel on a cluster. If you want to try atlas and have a linux computer (OSX may also work), you can use our `example data`_ for testing. @@ -20,7 +20,7 @@ For real metagenomic data atlas should be run on a _linux_ sytem, with enough me You need to install `anaconda `_ or miniconda. -If you haven't done it already you need to configure conda with the bioconda-channel and the conda-forge channel. This are sources for packages beyond the default one. +If you haven't done it already, you need to configure conda with the bioconda-channel and the conda-forge channel. This are sources for packages beyond the default one. Setting strict channel priority can prevent quite some annoyances. .. code-block:: bash @@ -38,12 +38,12 @@ Conda can be a bit slow because there are so many packages. A good way around th conda install mamba -From now on you can replace ``conda install`` with ``mamba install`` and see how much faster this snake is. +From now on, you can replace ``conda install`` with ``mamba install`` and see how much faster this snake is. Install metagenome-atlas ------------------------ -We recommend you to install metagenome-atlas into a conda environment e.g. named ``atlasenv`` +We recommend to install metagenome-atlas into a conda environment e.g. named ``atlasenv``. We also recommend to specify the latest version of metagenome-atlas. .. 
code-block:: bash @@ -62,7 +62,7 @@ where `{latest_version}` should be replaced by Install metagenome-atlas from GitHub ------------------------------------ -Alternatively you can install metagenome Atlas directly form GitHub. This allows you to access versions that are not yet in the conda release, e.g. versions that are still in development. +Alternatively, you can install metagenome Atlas directly from GitHub. This allows you to access versions that are not yet in the conda release, e.g. versions that are still in development. .. code-block:: bash @@ -76,7 +76,7 @@ Alternatively you can install metagenome Atlas directly form GitHub. This allows mamba env create -n atlas-dev --file atlasenv.yml conda activate atlas-dev - # install atlas version. Changes in this files are directly available in the atlas dev version + # install atlas version. Changes in the files are directly available in the atlas dev version pip install --editable . cd .. @@ -89,9 +89,9 @@ Alternatively you can install metagenome Atlas directly form GitHub. This allows Example Data ============ -If you want to test atlas on a small example data here is a two sample, three genome minimal metagenome dataset, +If you want to test atlas on a small example data, here is a two sample, three genome minimal metagenome dataset, to test atlas. Even when atlas will run faster on the test data, -it will anyway download all the databases and requirements, for the a complete run, +it will anyway download all the databases and requirements, for a complete run, which can take a certain amount of time and especially disk space (>100Gb). The database dir of the test run should be the same as for the later atlas executions. @@ -119,13 +119,13 @@ This command parses the folder for fastq files (extension ``.fastq(.gz)`` or ``. The command creates a ``samples.tsv`` and a ``config.yaml`` in the working directory. -Have a look at them with a normal text editor and check if the samples names are inferred correctly. The sample names are used for the naming of contigs, genes, and genomes. Therefore, the sample names should consist only form digits and letters and start with a letter (Even though one ``-`` is allowed). Atlas tries to simplify the file name to obtain unique sample names, if it doesn't succeed it simply puts S1, S2, ... as sample names. +Have a look at them with a normal text editor and check if the sample names are inferred correctly. The sample names are used for the naming of contigs, genes, and genomes. Therefore, the sample names should consist only of digits and letters and start with a letter (Even though one ``-`` is allowed). Atlas tries to simplify the file name to obtain unique sample names, if it doesn't succeed it simply puts S1, S2, ... as sample names. See the :download:`example sample table <../reports/samples.tsv>` The ``BinGroup`` parameter is used during the genomic binning. -In short: If you have between 5 and 150 samples the default (puting everithing in one group) is fine. +In short: If you have between 5 and 150 samples the default (putting everything in one group) is fine. If you have less than 5 samples, put every sample in an individual BinGroup and use `metabat` as final binner. If you have more samples see the :ref:`cobinning` section for more details. @@ -180,11 +180,11 @@ Since v2.9 atlas has possibility to start a new project from public data stored You can run ``atlas init-public `` and specify any ids, like bioprojects, or other SRA ids. 
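For example, with a hypothetical BioProject accession (replace it with your own identifiers)::

    atlas init-public PRJNA123456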
-Atlas does the folowing steps: +Atlas does the following steps: - 1. Search SRA for the corresponding sequences (Runs) and save them in the file ``SRA/RunInfo_original.tsv``. For example if you specify a Bioproject, it fetches the information for all runs of this project. + 1. Search SRA for the corresponding sequences (Runs) and save them in the file ``SRA/RunInfo_original.tsv``. For example, if you specify a Bioproject, it fetches the information for all runs of this project. 2. Atlas filters the runs to contain only valid metagenome sequences. E.g. exclude singleton reads, 16S. The output will be saved in ``RunInfo.tsv`` - 3. Sometimes the same Sample is sequenced on different laines, which will result into multipe runs from the same sample. Atlas will **merge** runs from the same biosample. + 3. Sometimes the same Sample is sequenced on different lanes, which will result into multiple runs from the same sample. Atlas will **merge** runs from the same biosample. 4. Prepare a sample table and a config.yaml similar to the ``atlas init`` command. @@ -196,10 +196,10 @@ Limitations: For now atlas, cannot handle a mixture of paired and single end rea If you have longreads for your project, you would need to specify them yourself in the sample.tsv. During the run, the reads are downloaded from SRA in the likely most efficient way using prefetch and parallel, fastq.gz generation. -The download step has checkpoints, so if the pipline gets interupted, you can restart where you left off. -Using the comand line arguments ``--restart-times 3 and --keep-going`` You can even ask atlas to do multiple restarts befor stoping. +The download step has checkpoints, so if the pipeline gets interrupted, you can restart where you left off. +Using the command line arguments ``--restart-times 3 and --keep-going`` You can even ask atlas to do multiple restarts before stopping. -The downloaded reads, are directly processed. If you however want only to doenload the reads you can use.:: +The downloaded reads are directly processed. However, if you only want to download the reads you can use:: atlas run None download_sra @@ -247,7 +247,7 @@ We recommend to use atlas on a :ref:`cluster` system, which can be set up in a v Usage: atlas run [OPTIONS] [qc|assembly|binning|genomes|genecatalog|None|all] [SNAKEMAKE_ARGS]... - Runs the ATLAS pipline + Runs the ATLAS pipeline By default all steps are executed but a sub-workflow can be specified. Needs a config-file and expects to find a sample table in the working- @@ -262,7 +262,7 @@ We recommend to use atlas on a :ref:`cluster` system, which can be set up in a v -w, --working-dir PATH location to run atlas. -c, --config-file PATH config-file generated with 'atlas init' -j, --jobs INTEGER use at most this many jobs in parallel (see cluster - submission for mor details). + submission for more details). --profile TEXT snakemake profile e.g. for cluster execution. -n, --dryrun Test execution. [default: False] @@ -282,7 +282,7 @@ Automatic submitting to cluster systems --------------------------------------- Thanks to the underlying snakemake Atlas can submit parts of the pipeline automatically to a cluster system and define the appropriate resources. If one job has finished it launches the next one. -This allows you use the full capacity of your cluster system. You even need to pay attention not to spam the other users of the cluster. +This allows to use the full capacity of your cluster system. You even need to pay attention not to spam the other users of the cluster. 
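Once a profile is set up (see the next section), a typical call could look like this, assuming the profile is named ``cluster``::

    atlas run all --working-dir my_project --profile cluster --jobs 50

With a profile, ``--jobs`` limits how many jobs are submitted to the queue simultaneously.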
@@ -303,7 +303,7 @@ Then run:: cookiecutter --output-dir ~/.config/snakemake https://github.com/metagenome-atlas/clusterprofile.git -This opens a interactive shell dialog and ask you for the name of the profile and your cluster system. +This opens an interactive shell dialog and ask you for the name of the profile and your cluster system. We recommend you keep the default name ``cluster``. The profile was tested on ``slurm``, ``lsf`` and ``pbs``. The resources (threads, memory and time) are defined in the atlas config file (hours and GB). @@ -352,11 +352,11 @@ The atlas argument ``--jobs`` now becomes the number of jobs simultaneously subm Single machine execution ======================== -If you dont want to use the :ref:`automatic scheduling ` you can use atlas on a single machine (local execution) with a lot of memory and threads ideally. In this case I recommend you the following options. The same applies if you submit a single job to a cluster running atlas. +If you don't want to use the :ref:`automatic scheduling ` you can use atlas on a single machine (local execution) with a lot of memory and threads ideally. In this case I recommend you the following options. The same applies if you submit a single job to a cluster running atlas. -Atlas detects how many CPUs and how much memory is available on your system and it will schedule as many jobs in paralell as possible. If you have less resources available than specified in the config file, the jobs are downscaled. +Atlas detects how many CPUs and how much memory is available on your system and it will schedule as many jobs in parallel as possible. If you have less resources available than specified in the config file, the jobs are downscaled. -By default atlas will use all cpus and 95% of all the available memory. If you are not happy with that, or you need to specify an exact ammount of memory/ cpus you can use the comand line arguments ``--jobs`` and ``--max-mem`` to do so. +By default atlas will use all cpus and 95% of all the available memory. If you are not happy with that, or you need to specify an exact amount of memory/ cpus you can use the command line arguments ``--jobs`` and ``--max-mem`` to do so. Cloud execution diff --git a/workflow/report/template_QC_report.html b/workflow/report/template_QC_report.html index 97a9ebf0..2bd2bb2b 100644 --- a/workflow/report/template_QC_report.html +++ b/workflow/report/template_QC_report.html @@ -19,7 +19,7 @@

Quality Control Report

-

Number of reads troughout th quality control process

+

Number of reads that went through the quality control process.

{div[Reads]} @@ -50,7 +50,7 @@

Number of reads troughout th quality control process

-

Total number of reads/bases ater the QC +

Total number of reads/bases after QC

@@ -65,17 +65,17 @@

Total number of reads/bases ater the QC

-

Quality values along the read +

Base quality values along reads

{div[quality_QC]} -

Read length +

Read length

{div[Length]} -

Insert size -

The size of the reads + the space between. Ideally the paired-end reads don't overlap.

+

Insert size

+

The size of the reads + the space between. Ideally, the paired-end reads don't overlap.

{div[Insert]} diff --git a/workflow/report/template_assembly_report.html b/workflow/report/template_assembly_report.html index 215a960d..d3960bab 100644 --- a/workflow/report/template_assembly_report.html +++ b/workflow/report/template_assembly_report.html @@ -24,11 +24,11 @@

Total assembly length

{div[Total]} -

Fragmentation +

Fragmentation

N50/N90 is a measure of how fractionated assemblies are: - 50%/ 90% of the assembly is made up of contigs of Length N50/N90-length or longer. + 50%/90% of the assembly consists of contigs of length N50/N90 or longer. You need N50/N90-number contigs to get 50%/90% of the total assembly length.

diff --git a/workflow/report/template_bin_report.html b/workflow/report/template_bin_report.html index 800222ab..eed781d3 100644 --- a/workflow/report/template_bin_report.html +++ b/workflow/report/template_bin_report.html @@ -23,21 +23,21 @@

Bin Report for Binner {binner}

{div[QualityScore]}

For all the information see the file {div[input_file]}

-

Number of genomes +

Number of genomes

{div[table]}

"Good quality" refers to the standard of Completeness > 90% and Contamination < 5%. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score > 90.

-

Quality for all bins +

Quality for all bins

{div[2D]} -

Quality for Species representatives +

Quality for Species representatives

{div[2Dsp]} -

Quality score by Sample +

Quality score by Sample

@@ -49,4 +49,4 @@

Quality score by Sample - \ No newline at end of file + diff --git a/workflow/rules/binning.smk b/workflow/rules/binning.smk index 233d2da7..ffb6bd62 100644 --- a/workflow/rules/binning.smk +++ b/workflow/rules/binning.smk @@ -273,7 +273,7 @@ rule get_unique_cluster_attribution: if new_d.shape[0] == 0: logger.warning( f"No bins detected with binner {wildcards.binner} in sample {wildcards.sample}.\n" - "I add longest contig to make the pipline continue" + "I add longest contig to make the pipeline continue" ) new_d[f"{wildcards.sample}_0"] = "{sample}_{binner}_1".format(**wildcards) diff --git a/workflow/scripts/parse_vamb.py b/workflow/scripts/parse_vamb.py index c0d39360..01f7cdb8 100644 --- a/workflow/scripts/parse_vamb.py +++ b/workflow/scripts/parse_vamb.py @@ -137,7 +137,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): if len(samples_without_bins) > 0: logging.warning( - "The following samples did't yield bins, I add longest contig to make the pipline continue:\n" + "The following samples didn't yield bins, I add longest contig to make the pipeline continue:\n" + "\n".join(samples_without_bins) ) diff --git a/workflow/scripts/utils/io.py b/workflow/scripts/utils/io.py index aa20d621..57640a99 100644 --- a/workflow/scripts/utils/io.py +++ b/workflow/scripts/utils/io.py @@ -135,7 +135,7 @@ def _pandas_concat_disck_based( selected_headers.update(list(headers_of_file.columns)) selected_headers = list(selected_headers) - logger.info(f"Infered folowing list of headers {selected_headers}") + logger.info(f"Inferred following list of headers {selected_headers}") # parse one file after another From df691ac6347870eb651df9b3d5dbbed0391634ab Mon Sep 17 00:00:00 2001 From: Silvio Waschina Date: Fri, 12 Jan 2024 18:25:30 +0100 Subject: [PATCH 02/37] Back-change heading closing tags - https://github.com/metagenome-atlas/atlas/pull/713#issuecomment-1889512838 --- workflow/report/template_QC_report.html | 8 ++++---- workflow/report/template_assembly_report.html | 2 +- workflow/report/template_bin_report.html | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/workflow/report/template_QC_report.html b/workflow/report/template_QC_report.html index 2bd2bb2b..e53f6222 100644 --- a/workflow/report/template_QC_report.html +++ b/workflow/report/template_QC_report.html @@ -50,7 +50,7 @@

Number of reads that went through the quality control process.

-

Total number of reads/bases after QC

+

Total number of reads/bases after QC
@@ -65,16 +65,16 @@

Total number of reads/bases after QC

-

Base quality values along reads

+

Base quality values along reads {div[quality_QC]} -

Read length

+

Read length {div[Length]} -

Insert size

+

Insert size

The size of the reads + the space between. Ideally, the paired-end reads don't overlap.

{div[Insert]} diff --git a/workflow/report/template_assembly_report.html b/workflow/report/template_assembly_report.html index d3960bab..ed15bae6 100644 --- a/workflow/report/template_assembly_report.html +++ b/workflow/report/template_assembly_report.html @@ -24,7 +24,7 @@

Total assembly length

{div[Total]} -

Fragmentation

+

Fragmentation

N50/N90 is a measure of how fractionated assemblies are: diff --git a/workflow/report/template_bin_report.html b/workflow/report/template_bin_report.html index eed781d3..f2417961 100644 --- a/workflow/report/template_bin_report.html +++ b/workflow/report/template_bin_report.html @@ -23,21 +23,21 @@

Bin Report for Binner {binner}

{div[QualityScore]}

For all the information see the file {div[input_file]}

-

Number of genomes

+

Number of genomes {div[table]}

"Good quality" refers to the standard of Completeness > 90% and Contamination < 5%. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score > 90.

-

Quality for all bins

+

Quality for all bins {div[2D]} -

Quality for Species representatives

+

Quality for Species representatives {div[2Dsp]} -

Quality score by Sample

+

Quality score by Sample From 54cdbb35980eba4d4dd2dbe1463948f710c2139b Mon Sep 17 00:00:00 2001 From: silask Date: Mon, 22 Jan 2024 14:22:28 +0100 Subject: [PATCH 03/37] correct h2 tab --- docs/reports/QC_report.html | 8 ++++---- docs/reports/assembly_report.html | 2 +- docs/reports/bin_report_DASTool.html | 8 ++++---- docs/reports/bin_report_SemiBin.html | 8 ++++---- docs/reports/bin_report_vamb.html | 8 ++++---- workflow/report/template_QC_report.html | 8 ++++---- workflow/report/template_assembly_report.html | 2 +- workflow/report/template_bin_report.html | 8 ++++---- 8 files changed, 26 insertions(+), 26 deletions(-) diff --git a/docs/reports/QC_report.html b/docs/reports/QC_report.html index ea2e14a0..a95ed956 100644 --- a/docs/reports/QC_report.html +++ b/docs/reports/QC_report.html @@ -92,7 +92,7 @@

Number of reads throughout the quality control process

-

Total number of reads/bases after the QC +

Total number of reads/bases after the QC

@@ -107,16 +107,16 @@

Total number of reads/bases after the QC

-

Quality values along the read +

Quality values along the read

-

Read length +

Read length

-

Insert size +

Insert size

The size of the reads + the space between. Ideally, the paired-end reads don't overlap.

diff --git a/docs/reports/assembly_report.html b/docs/reports/assembly_report.html index 33cb12a7..b9a60466 100644 --- a/docs/reports/assembly_report.html +++ b/docs/reports/assembly_report.html @@ -66,7 +66,7 @@

Total assembly length

-

Fragmentation +

Fragmentation

N50/N90 is a measure of how fractionated assemblies are: diff --git a/docs/reports/bin_report_DASTool.html b/docs/reports/bin_report_DASTool.html index a45aa818..e74a54a6 100644 --- a/docs/reports/bin_report_DASTool.html +++ b/docs/reports/bin_report_DASTool.html @@ -65,7 +65,7 @@

Bin Report for Binner DASTool

Quality score is calculated as: Completeness - 5 x Contamination.

For all the information see the file Binning/DASTool/bin_info.tsv and Binning/DASTool/bins2species.tsv

-

Number of genomes +

Number of genomes

@@ -100,16 +100,16 @@

Number of genomes

"Good quality" refers to the standard of Completeness > 90% and Contamination < 5%. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score > 90.

-

Quality for all bins +

Quality for all bins

-

Quality for Species representatives +

Quality for Species representatives

-

Quality score by Sample +

Quality score by Sample

diff --git a/docs/reports/bin_report_SemiBin.html b/docs/reports/bin_report_SemiBin.html index f742cd89..064959fa 100644 --- a/docs/reports/bin_report_SemiBin.html +++ b/docs/reports/bin_report_SemiBin.html @@ -65,7 +65,7 @@

Bin Report for Binner SemiBin

Quality score is calculated as: Completeness - 5 x Contamination.

For all the information see the file Binning/SemiBin/bin_info.tsv and Binning/SemiBin/bins2species.tsv

-

Number of genomes +

Number of genomes

@@ -100,16 +100,16 @@

Number of genomes

"Good quality" refers to the standard of Completeness > 90% and Contamination < 5%. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score > 90.

-

Quality for all bins +

Quality for all bins

-

Quality for Species representatives +

Quality for Species representatives

-

Quality score by Sample +

Quality score by Sample

diff --git a/docs/reports/bin_report_vamb.html b/docs/reports/bin_report_vamb.html index 16c31e1c..6357e9cd 100644 --- a/docs/reports/bin_report_vamb.html +++ b/docs/reports/bin_report_vamb.html @@ -65,7 +65,7 @@

Bin Report for Binner vamb

Quality score is calculated as: Completeness - 5 x Contamination.

For all the information see the file Binning/vamb/bin_info.tsv and Binning/vamb/bins2species.tsv

-

Number of genomes +

Number of genomes

@@ -100,16 +100,16 @@

Number of genomes

"Good quality" refers to the standard of Completeness > 90% and Contamination < 5%. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score > 90.

-

Quality for all bins +

Quality for all bins

-

Quality for Species representatives +

Quality for Species representatives

-

Quality score by Sample +

Quality score by Sample

diff --git a/workflow/report/template_QC_report.html b/workflow/report/template_QC_report.html index e53f6222..2bd2bb2b 100644 --- a/workflow/report/template_QC_report.html +++ b/workflow/report/template_QC_report.html @@ -50,7 +50,7 @@

Number of reads that went through the quality control process.

-

Total number of reads/bases after QC +

Total number of reads/bases after QC

@@ -65,16 +65,16 @@

Total number of reads/bases after QC

-

Base quality values along reads +

Base quality values along reads

{div[quality_QC]} -

Read length +

Read length

{div[Length]} -

Insert size +

Insert size

The size of the reads + the space between. Ideally, the paired-end reads don't overlap.

{div[Insert]} diff --git a/workflow/report/template_assembly_report.html b/workflow/report/template_assembly_report.html index ed15bae6..d3960bab 100644 --- a/workflow/report/template_assembly_report.html +++ b/workflow/report/template_assembly_report.html @@ -24,7 +24,7 @@

Total assembly length

{div[Total]} -

Fragmentation +

Fragmentation

N50/N90 is a measure of how fractionated assemblies are: diff --git a/workflow/report/template_bin_report.html b/workflow/report/template_bin_report.html index f2417961..eed781d3 100644 --- a/workflow/report/template_bin_report.html +++ b/workflow/report/template_bin_report.html @@ -23,21 +23,21 @@

Bin Report for Binner {binner}

{div[QualityScore]}

For all the information see the file {div[input_file]}

-

Number of genomes +

Number of genomes

{div[table]}

"Good quality" refers to the standard of Completeness > 90% and Contamination < 5%. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score > 90.

-

Quality for all bins +

Quality for all bins

{div[2D]} -

Quality for Species representatives +

Quality for Species representatives

{div[2Dsp]} -

Quality score by Sample +

Quality score by Sample

From 30b32b5bae3e353cf9b0d862bc5715240401c3d1 Mon Sep 17 00:00:00 2001 From: Niklaus Johner Date: Thu, 15 Feb 2024 10:46:43 +0100 Subject: [PATCH 04/37] Update multiqc wrapper to newest version. v1.19.1 is not compatible with Python 3.12. --- workflow/rules/genomes.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/genomes.smk b/workflow/rules/genomes.smk index ea53d19d..bc2dfdf0 100644 --- a/workflow/rules/genomes.smk +++ b/workflow/rules/genomes.smk @@ -310,7 +310,7 @@ rule multiqc_mapping_genome: log: "logs/genomes/alignment/multiqc.log", wrapper: - "v1.19.1/bio/multiqc" + "v3.3.6/bio/multiqc" rule pileup_MAGs: From cbff85689487af6e0eca3ea9460fbc8465ffdafc Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Mon, 29 Apr 2024 11:08:55 +0200 Subject: [PATCH 05/37] path to config file --- docs/usage/configuration.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/usage/configuration.rst b/docs/usage/configuration.rst index 0dc88b10..4864fc6a 100644 --- a/docs/usage/configuration.rst +++ b/docs/usage/configuration.rst @@ -101,8 +101,7 @@ Example config file =================== -.. -include:: ../../workflow/../config/template_config.yaml +..include:: ../../config/template_config.yaml :code: From bfce186dd46cfe62c72b31401def86da6a793413 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 4 Jun 2024 21:09:29 +0200 Subject: [PATCH 06/37] seperate log file does not work --- workflow/rules/qc.smk | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 0dcb7401..b6b7e541 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -158,13 +158,12 @@ if not SKIP_QC: dupesubs=config["duplicates_allow_substitutions"], only_optical=("t" if config.get("duplicates_only_optical") else "f"), log: - sterr="{sample}/logs/QC/deduplicate.err", - stout="{sample}/logs/QC/deduplicate.log", + "{sample}/logs/QC/deduplicate.log", conda: "%s/required_packages.yaml" % CONDAENV threads: config.get("threads", 1) resources: - mem=config["mem"], + mem_mb=config["mem"]*1024, java_mem=int(config["mem"] * JAVA_MEM_FRACTION), shell: "clumpify.sh " @@ -177,8 +176,7 @@ if not SKIP_QC: " threads={threads} " " pigz=t unpigz=t " " -Xmx{resources.java_mem}G " - " 2> {log.sterr} " - " 1> {log.stout} " + " &> {log} " PROCESSED_STEPS.append("filtered") From f65f1f824ab1b5bee17f8e14bb21aded3dfffc3b Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Sun, 23 Jun 2024 16:56:21 +0200 Subject: [PATCH 07/37] correct import of load_configfile --- atlas/atlas.py | 2 +- atlas/make_config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/atlas/atlas.py b/atlas/atlas.py index 330d1487..265f2665 100644 --- a/atlas/atlas.py +++ b/atlas/atlas.py @@ -7,7 +7,7 @@ import click -from snakemake.io import load_configfile +from snakemake.common.configfile import load_configfile from .make_config import validate_config from .init.atlas_init import run_init # , run_init_sra diff --git a/atlas/make_config.py b/atlas/make_config.py index 96332497..3f21dcb8 100644 --- a/atlas/make_config.py +++ b/atlas/make_config.py @@ -1,6 +1,6 @@ from .default_values import * from snakemake.utils import update_config as snakemake_update_config -from snakemake.io import load_configfile +from snakemake.common.configfile import load_configfile import tempfile import sys import os From eed96574febac2dfd8fc636df02356ce2b333dca Mon Sep 17 00:00:00 2001 From: Kieser Silas Date: Sun, 23 Jun 2024 20:45:37 +0200 Subject: [PATCH 
08/37] add slurm plugin --- atlasenv.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/atlasenv.yml b/atlasenv.yml index c1ec80ea..20c65ac1 100644 --- a/atlasenv.yml +++ b/atlasenv.yml @@ -3,10 +3,10 @@ channels: - bioconda - defaults dependencies: - - python >=3.8, < 3.12 + - python >=3.10, < 3.12 - mamba - bbmap >= 39.01, <40 - - snakemake-minimal >= 7.18.1, <7.26 + - snakemake-minimal >= 8.12, <8.15 - pygments - networkx - graphviz @@ -16,3 +16,4 @@ dependencies: - ruamel.yaml >=0.17 - cookiecutter - wget + - snakemake-executor-plugin-slurm From 6d3ec545e2dc19b7bc7c89e1c08774f896550e78 Mon Sep 17 00:00:00 2001 From: Kieser Silas Date: Sun, 23 Jun 2024 20:47:02 +0200 Subject: [PATCH 09/37] spades 4 --- workflow/envs/spades.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/spades.yaml b/workflow/envs/spades.yaml index c660afab..be85bd0e 100644 --- a/workflow/envs/spades.yaml +++ b/workflow/envs/spades.yaml @@ -3,4 +3,4 @@ channels: - bioconda - defaults dependencies: - - spades>=3.15.3 + - spades>=4.0 From e84d76a932effb66003fd4aae6cc05342f874d78 Mon Sep 17 00:00:00 2001 From: Kieser Silas Date: Sun, 23 Jun 2024 20:47:39 +0200 Subject: [PATCH 10/37] error in mem definition --- workflow/rules/qc.smk | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index b6b7e541..a96da0a3 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -227,13 +227,12 @@ if not SKIP_QC: output.reads, key="out", allow_singletons=False ), log: - sterr="{sample}/logs/QC/quality_filter.err", - stout="{sample}/logs/QC/quality_filter.log", + "{sample}/logs/QC/quality_filter.log", conda: "%s/required_packages.yaml" % CONDAENV threads: config.get("threads", 1) resources: - mem=config["mem"], + mem_mb=config["mem"]*1024, java_mem=int(config["mem"] * JAVA_MEM_FRACTION), shell: " bbduk.sh {params.inputs} " @@ -258,8 +257,7 @@ if not SKIP_QC: " prealloc={params.prealloc} " " pigz=t unpigz=t " " -Xmx{resources.java_mem}G " - " 2> {log.sterr} " - " 1> {log.stout} " + " &> {log} " # if there are no references, decontamination will be skipped if len(config.get("contaminant_references", {}).keys()) > 0: @@ -463,7 +461,7 @@ else: kmer=config["merging_k"], threads: config["simplejob_threads"] resources: - mem=config["simplejob_mem"], + mem_mb = config["simplejob_mem"]*1024, conda: "../envs/required_packages.yaml" log: From 330df57e59109853e8c508be2fa1ef77ea37f984 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 25 Jun 2024 10:43:25 +0200 Subject: [PATCH 11/37] fix-spades-mem_mb --- workflow/rules/assemble.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/assemble.smk b/workflow/rules/assemble.smk index e189f3ec..cb5b2ff3 100644 --- a/workflow/rules/assemble.smk +++ b/workflow/rules/assemble.smk @@ -432,6 +432,7 @@ else: threads: config["assembly_threads"] resources: mem_mb=config["assembly_memory"] * 1000, + mem_gb= config["assembly_memory"], time_min=60 * config["runtime"]["assembly"], shell: # remove pipeline_state file to create all output files again @@ -439,7 +440,7 @@ else: " " "spades.py " " --threads {threads} " - " --memory {resources.mem} " + " --memory {resources.mem_gb} " " -o {params.p[outdir]} " " -k {params.k}" " {params.p[preset]} " From fe9cdc89553e33bd1daf84ec503f02eaff53af6b Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 25 Jun 2024 10:43:38 +0200 Subject: [PATCH 12/37] path in bin report --- workflow/rules/derep.smk | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/derep.smk b/workflow/rules/derep.smk index 63312277..9e2c78c0 100644 --- a/workflow/rules/derep.smk +++ b/workflow/rules/derep.smk @@ -110,4 +110,4 @@ rule build_bin_report: log: "logs/binning/report_{binner}.log", script: - "../report/bin_report.py" + "../../report/bin_report.py" From df1e8bb399d7addf4c133bd8ea1ac1d4f8789033 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 25 Jun 2024 12:04:11 +0200 Subject: [PATCH 13/37] fix bin report path again --- workflow/rules/derep.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/derep.smk b/workflow/rules/derep.smk index 9e2c78c0..63312277 100644 --- a/workflow/rules/derep.smk +++ b/workflow/rules/derep.smk @@ -110,4 +110,4 @@ rule build_bin_report: log: "logs/binning/report_{binner}.log", script: - "../../report/bin_report.py" + "../report/bin_report.py" From 74a6a52e38972f8f2d2b28029586bdb187063851 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 25 Jun 2024 12:04:56 +0200 Subject: [PATCH 14/37] update verstion and paths --- workflow/envs/gtdbtk.yaml | 2 +- workflow/rules/download.smk | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/envs/gtdbtk.yaml b/workflow/envs/gtdbtk.yaml index 5a8f5c00..600ccca0 100644 --- a/workflow/envs/gtdbtk.yaml +++ b/workflow/envs/gtdbtk.yaml @@ -3,4 +3,4 @@ channels: - bioconda - defaults dependencies: -- gtdbtk =2.3 +- gtdbtk =2.4 diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index 7e476cde..197e5235 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -17,8 +17,8 @@ EGGNOG_DIR = os.path.join(DBDIR, "EggNOG_V" + EGGNOG_VERSION) CONDAENV = "../envs" -GTDB_VERSION = "V08_R214" -GTDB_DATA_URL = "https://data.gtdb.ecogenomic.org/releases/release214/214.0/auxillary_files/gtdbtk_r214_data.tar.gz" +GTDB_VERSION = "V09_R200" +GTDB_DATA_URL = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" GTDBTK_DATA_PATH = os.path.join(DBDIR, "GTDB_" + GTDB_VERSION) From 1eddc340cd629340b7b354f15957a9d2f7bb0243 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 25 Jun 2024 23:24:04 +0200 Subject: [PATCH 15/37] use split download from gtdbtk --- workflow/rules/download.smk | 193 ++++++++++++------------------------ 1 file changed, 63 insertions(+), 130 deletions(-) diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index 197e5235..c4b0ae56 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -1,25 +1,66 @@ import hashlib import os - +from pathlib import Path # this values are incuded in the snakefile -DBDIR = os.path.realpath(config["database_dir"]) -CHECKMDIR = os.path.join(DBDIR, "checkm") -CHECKM_ARCHIVE = "checkm_data_v1.0.9.tar.gz" -CAT_DIR = os.path.join(DBDIR, "CAT") -CAT_flag_downloaded = os.path.join(CAT_DIR, "downloaded") -GUNCDIR = os.path.join(DBDIR, "gunc_database") -BUSCODIR = os.path.join(DBDIR, "busco_lineages") +DBDIR = Path(config["database_dir"]).resolve() + +GUNCDIR = DBDIR/ "gunc_database" +BUSCODIR = DBDIR/ "busco_lineages" ZENODO_ARCHIVE = "1134890" EGGNOG_VERSION = "5" -EGGNOG_DIR = os.path.join(DBDIR, "EggNOG_V" + EGGNOG_VERSION) +EGGNOG_DIR = DBDIR/ ("EggNOG_V" + EGGNOG_VERSION) CONDAENV = "../envs" + +## GTDBTk + GTDB_VERSION = "V09_R200" -GTDB_DATA_URL = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" 
-GTDBTK_DATA_PATH = os.path.join(DBDIR, "GTDB_" + GTDB_VERSION) +GTDB_DATA_URL = Path("https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package") +GTDBTK_DATA_PATH = DBDIR/ ("GTDB_" + GTDB_VERSION) + + +def all_partial_gtdb_tarbals(wildcards,GTDB_REFSEQ_VERSION=220,GTDB_PATIAL_SUFFIXES=["a"+i for i in "abcdefghijk"]): + + return expand(GTDBTK_DATA_PATH/"gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}", + gtdb_refseq_version= GTDB_REFSEQ_VERSION, + suffix=GTDB_PATIAL_SUFFIXES) + + +localrules: + download_partial_gtdb, extract_gtdb + + +rule download_partial_gtdb: + output: + temp(GTDBTK_DATA_PATH/"gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}"), + threads: 1 + params: + url = lambda wc,output: GTDB_DATA_URL/"split_package"/ Path(output[0]).name + resources: + time_min=60 * int(config.get("runtime", {"long": 10})["long"]), + log: + "logs/download/gtdbtk_r{gtdb_refseq_version}_part_{suffix}.log", + shell: + " wget --no-check-certificate {params.url} -O {output} &> {log} " + + +rule extract_gtdb: + input: + all_partial_gtdb_tarbals + output: + touch(os.path.join(GTDBTK_DATA_PATH, "downloaded_success")), + threads: 1 + resources: + time_min=60 * int(config.get("runtime", {"long": 10})["long"]), + log: + "logs/download/gtdbtk_untar.log", + shell: + '( cat {input} | tar -xzvf - -C "{GTDBTK_DATA_PATH}" --strip 1 ) 2> {log} ' + +### end GTDBTk def md5(fname): @@ -37,50 +78,11 @@ def md5(fname): FILES = { "adapters.fa": "ae839dc79cfb855a1b750a0d593fe01e", "phiX174_virus.fa": "82516880142e8c89b466bc6118696c47", - "refseq.db": "42b8976656f2cfd661b8a299d6e24c19", - "refseq.dmnd": "c01facc7e397270ccb796ea799a09108", - "refseq.tree": "469fcbeb15dd0d4bf8f1677682bde157", "silva_rfam_all_rRNAs.fa": "f102e35d9f48eabeb0efe9058559bc66", - "eggnog.db": "7923d3bb7eca8e0e8f122be4b5ca6997", - "eggnog_proteins.dmnd": "64fefa838833a6f3e220a06fb9d403cd", - CHECKM_ARCHIVE: "631012fa598c43fdeb88c619ad282c4d", + } -CHECKMFILES = [ - "%s/taxon_marker_sets.tsv" % CHECKMDIR, - "%s/selected_marker_sets.tsv" % CHECKMDIR, - "%s/pfam/tigrfam2pfam.tsv" % CHECKMDIR, - "%s/pfam/Pfam-A.hmm.dat" % CHECKMDIR, - "%s/img/img_metadata.tsv" % CHECKMDIR, - "%s/hmms_ssu/SSU_euk.hmm" % CHECKMDIR, - "%s/hmms_ssu/SSU_bacteria.hmm" % CHECKMDIR, - "%s/hmms_ssu/SSU_archaea.hmm" % CHECKMDIR, - "%s/hmms_ssu/createHMMs.py" % CHECKMDIR, - "%s/hmms/phylo.hmm.ssi" % CHECKMDIR, - "%s/hmms/phylo.hmm" % CHECKMDIR, - "%s/hmms/checkm.hmm.ssi" % CHECKMDIR, - "%s/hmms/checkm.hmm" % CHECKMDIR, - "%s/genome_tree/missing_duplicate_genes_97.tsv" % CHECKMDIR, - "%s/genome_tree/missing_duplicate_genes_50.tsv" % CHECKMDIR, - "%s/genome_tree/genome_tree.taxonomy.tsv" % CHECKMDIR, - "%s/genome_tree/genome_tree_reduced.refpkg/phylo_modelJqWx6_.json" % CHECKMDIR, - "%s/genome_tree/genome_tree_reduced.refpkg/genome_tree.tre" % CHECKMDIR, - "%s/genome_tree/genome_tree_reduced.refpkg/genome_tree.log" % CHECKMDIR, - "%s/genome_tree/genome_tree_reduced.refpkg/genome_tree.fasta" % CHECKMDIR, - "%s/genome_tree/genome_tree_reduced.refpkg/CONTENTS.json" % CHECKMDIR, - "%s/genome_tree/genome_tree.metadata.tsv" % CHECKMDIR, - "%s/genome_tree/genome_tree_full.refpkg/phylo_modelEcOyPk.json" % CHECKMDIR, - "%s/genome_tree/genome_tree_full.refpkg/genome_tree.tre" % CHECKMDIR, - "%s/genome_tree/genome_tree_full.refpkg/genome_tree.log" % CHECKMDIR, - "%s/genome_tree/genome_tree_full.refpkg/genome_tree.fasta" % CHECKMDIR, - "%s/genome_tree/genome_tree_full.refpkg/CONTENTS.json" % CHECKMDIR, - "%s/genome_tree/genome_tree.derep.txt" % 
CHECKMDIR, - "%s/.dmanifest" % CHECKMDIR, - "%s/distributions/td_dist.txt" % CHECKMDIR, - "%s/distributions/gc_dist.txt" % CHECKMDIR, - "%s/distributions/cd_dist.txt" % CHECKMDIR, -] def get_eggnog_db_file(): @@ -97,7 +99,6 @@ localrules: download, download_eggNOG_files, download_atlas_files, - download_checkm_data, download_gunc, @@ -111,7 +112,7 @@ rule download: ), get_eggnog_db_file(), f"{DBDIR}/CheckM2", - os.path.join(GTDBTK_DATA_PATH, "downloaded_success"), + GTDBTK_DATA_PATH/ "downloaded_success" rule download_eggNOG_files: @@ -139,73 +140,7 @@ rule download_atlas_files: raise OSError(2, "Invalid checksum", output[0]) -rule download_checkm_data: - output: - tar=temp(CHECKM_ARCHIVE), - files=CHECKMFILES, - params: - path=CHECKMDIR, - run: - shell( - "wget -O {output.tar} 'https://zenodo.org/record/{ZENODO_ARCHIVE}/files/{CHECKM_ARCHIVE}' " - ) - if not FILES[CHECKM_ARCHIVE] == md5(output.tar): - raise OSError(2, "Invalid checksum", CHECKM_ARCHIVE) - - shell("tar -zxf {output.tar} --directory {params.path}") - - -localrules: - initialize_checkm, - - -rule initialize_checkm: - input: - ancient(CHECKMFILES), - output: - touched_output=touch("logs/checkm_init.txt"), - params: - database_dir=CHECKMDIR, - conda: - "%s/checkm.yaml" % CONDAENV - log: - "logs/initialize_checkm.log", - shell: - "checkm data setRoot {params.database_dir} &> {log} " - -localrules: - download_gtdb, - - -rule download_gtdb: - output: - temp(f"{GTDBTK_DATA_PATH}/gtdb_data.tar.gz"), - conda: - "../envs/gtdbtk.yaml" - threads: 1 - resources: - time_min=60 * int(config.get("runtime", {"long": 10})["long"]), - log: - "logs/download/gtdbtk.log", - shell: - " wget --no-check-certificate {GTDB_DATA_URL} -O {output} &> {log} " - - -rule extract_gtdb: - input: - rules.download_gtdb.output, - output: - touch(os.path.join(GTDBTK_DATA_PATH, "downloaded_success")), - conda: - "../envs/gtdbtk.yaml" - threads: 1 - resources: - time_min=60 * int(config.get("runtime", {"long": 10})["long"]), - log: - "logs/download/gtdbtk_untar.log", - shell: - 'tar -xzvf {input} -C "{GTDBTK_DATA_PATH}" --strip 1 2> {log}; ' rule checkm2_download_db: @@ -261,14 +196,12 @@ onsuccess: onerror: print("An error occurred while downloading reference databases.") - print( - "ATLAS databases can be manually downloaded from: https://zenodo.org/record/%s" - % ZENODO_ARCHIVE - ) - print( - "eggNOG databases can be manually downloaded from: http://eggnogdb.embl.de/download/emapperdb-%s" - % EGGNOG_VERSION - ) - print( - "CAT databases can be manually downloaded from: https://github.com/dutilh/CAT" - ) + # print( + # "ATLAS databases can be manually downloaded from: https://zenodo.org/record/%s" + # % ZENODO_ARCHIVE + # ) + # print( + # "eggNOG databases can be manually downloaded from: http://eggnogdb.embl.de/download/emapperdb-%s" + # % EGGNOG_VERSION + # ) + From 71ff4e2519b1120cf822e9697240a04d1437ffac Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Wed, 26 Jun 2024 10:02:11 +0200 Subject: [PATCH 16/37] url is not a path --- workflow/rules/download.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index c4b0ae56..bf74a2c4 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -18,7 +18,7 @@ CONDAENV = "../envs" ## GTDBTk GTDB_VERSION = "V09_R200" -GTDB_DATA_URL = Path("https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package") +GTDB_DATA_URL = 
"https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package" GTDBTK_DATA_PATH = DBDIR/ ("GTDB_" + GTDB_VERSION) @@ -38,7 +38,7 @@ rule download_partial_gtdb: temp(GTDBTK_DATA_PATH/"gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}"), threads: 1 params: - url = lambda wc,output: GTDB_DATA_URL/"split_package"/ Path(output[0]).name + url = lambda wc,output: f"{GTDB_DATA_URL}/split_package/{ Path(output[0]).name}" resources: time_min=60 * int(config.get("runtime", {"long": 10})["long"]), log: From 367f1bfd593bb4a33d877a2c257fb4ffcc145ddc Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Wed, 26 Jun 2024 10:02:26 +0200 Subject: [PATCH 17/37] more memory for gtdbtk aling --- workflow/rules/gtdbtk.smk | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflow/rules/gtdbtk.smk b/workflow/rules/gtdbtk.smk index 02d630b0..456504b7 100644 --- a/workflow/rules/gtdbtk.smk +++ b/workflow/rules/gtdbtk.smk @@ -32,6 +32,8 @@ checkpoint align: output: directory(f"{gtdb_dir}/align"), threads: config["threads"] + resources: + mem_mb=config["large_mem"] * 1000, conda: "../envs/gtdbtk.yaml" log: From b7c73a58fa5dd2abf6ec5ea297f07369fa24a3cc Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Sat, 27 Jul 2024 11:47:05 +0200 Subject: [PATCH 18/37] sterr and stdout log for extract gtdb --- workflow/rules/download.smk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index bf74a2c4..85e56cd9 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -56,9 +56,10 @@ rule extract_gtdb: resources: time_min=60 * int(config.get("runtime", {"long": 10})["long"]), log: - "logs/download/gtdbtk_untar.log", + stdout="logs/download/gtdbtk_untar.log", + stderr="logs/download/gtdbtk_untar.err", shell: - '( cat {input} | tar -xzvf - -C "{GTDBTK_DATA_PATH}" --strip 1 ) 2> {log} ' + '( cat {input} | tar -xzvf - -C "{GTDBTK_DATA_PATH}" --strip 1 ) 2> {log.stderr} > {log.stdout} ' ### end GTDBTk From e905d3b9d90a7b8c011206987ea3958f51a8d21d Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Wed, 26 Jun 2024 10:14:42 +0200 Subject: [PATCH 19/37] my dram --- workflow/envs/dram.yaml | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/workflow/envs/dram.yaml b/workflow/envs/dram.yaml index 820601af..6025b0a9 100644 --- a/workflow/envs/dram.yaml +++ b/workflow/envs/dram.yaml @@ -2,5 +2,24 @@ channels: - conda-forge - bioconda dependencies: - - dram >= 1.4.6, <1.5 #Shold define all correct verrsions See https://github.com/bioconda/bioconda-recipes/pull/41518 - + - python >=3.8 + - altair >=4 + - networkx + - numpy + - openpyxl + - pandas >=1.5, <2 + - scikit-bio >=0.5.8, <0.6 + - sqlalchemy + - prodigal + - scipy >=1.9 + - mmseqs2 >10.6d92c + - hmmer + - trnascan-se >=2 + - barrnap + - ruby + - parallel + - wget + - curl + - pip + - pip: + - git+https://github.com/SilasK/DRAM.git \ No newline at end of file From 510ac16d0f0fcb4323b0a458c79cd4ffa842c85e Mon Sep 17 00:00:00 2001 From: silask Date: Wed, 26 Jun 2024 10:29:23 +0200 Subject: [PATCH 20/37] formating --- workflow/rules/assemble.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/assemble.smk b/workflow/rules/assemble.smk index cb5b2ff3..2ac36e19 100644 --- a/workflow/rules/assemble.smk +++ b/workflow/rules/assemble.smk @@ -432,7 +432,7 @@ else: threads: config["assembly_threads"] resources: mem_mb=config["assembly_memory"] * 1000, - mem_gb= config["assembly_memory"], + 
mem_gb=config["assembly_memory"],
         time_min=60 * config["runtime"]["assembly"],
     shell:
         # remove pipeline_state file to create all output files again
         " "
         "spades.py "
         " --threads {threads} "
         " --memory {resources.mem_gb} "
         " -o {params.p[outdir]} "
         " -k {params.k}"
         " {params.p[preset]} "

From c30ff088da03c49e925da13b74649b446605037c Mon Sep 17 00:00:00 2001
From: silask
Date: Fri, 28 Jun 2024 10:26:11 +0200
Subject: [PATCH 21/37] move changelog

---
 CHANGELOG.md            | 186 ++++++++++++++++++++++++++++++++++++++++
 docs/usage/changelog.md | 168 +-----------------------------------
 2 files changed, 189 insertions(+), 165 deletions(-)
 create mode 100644 CHANGELOG.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..2610beb9
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,186 @@

# Change log


## 2.18.1

Fix error with downloading DRAM. Update to DRAM v1.5.

## 2.18

- QC reads and assemblies are now written to the sample.tsv from the start. This should fix errors of partial writing to the sample.tsv https://github.com/metagenome-atlas/atlas/issues/695
- It also allows you to add external assemblies.
- Singleton reads are no longer used throughout the pipeline.
- This changes the default paths for raw reads and assemblies:
assemblies are now in `Assembly/fasta/{sample}.fasta`,
reads in `QC/reads/{sample}_{fraction}.fastq.gz`.

**Seamless update**: If you update atlas and continue on an old project, your old files will be copied, or the path defined in the sample.tsv will be used.



## 2.17

### Skani
The tool skani claims to be better and faster than the combination of mash + FastANI as used by dRep.
I implemented skani for species clustering.
We now do the species clustering in the `atlas run binning` step,
so you get information about the number of dereplicated species in the binning report. This allows you to run different binners before choosing the one to use for the genome annotation.
Also, the file storage was improved: all important files are in `Binning/{binner}/`.



My custom species clustering does the following steps:

1. Pre-cluster genomes with *single-linkage* at 92.5 ANI.
2. **Re-calibrate checkm2 results.**
    - If a minority of genomes from a pre-cluster use a different translation table, they are removed.
    - If some genomes of a pre-cluster don't use the specialized completeness model, we re-calibrate completeness to the minimum value.
      This ensures that a bad genome evaluated on the general model is not preferred over a better genome evaluated on the specific model.
      See also https://silask.github.io/post/better_genomes/ Section 2.
    - Drop genomes that don't correspond to the filter criteria after re-calibration.
3. Cluster genomes with an ANI threshold, default 95%.
4. Select the best genome as representative based on the quality score Completeness - 5x Contamination.




### New Contributors
* @jotech made their first contribution in https://github.com/metagenome-atlas/atlas/pull/667

## 2.16

 * gtdb08

## 2.15

* Use Gunc
* New folder organisation: main output files for binning are in the new folder `Binning`
* Use hdf-format for gene catalogs. This allows efficient storage and selective access to large count and coverage matrices from the genecatalog. (See docs for how to load them) https://github.com/metagenome-atlas/atlas/pull/621
* Semibin v. 1.5 by @SilasK in https://github.com/metagenome-atlas/atlas/pull/622


## 2.14

* Support for checkm2 by @SilasK in https://github.com/metagenome-atlas/atlas/pull/607

Thank you @trickovicmatija for your help.
**Full Changelog**: https://github.com/metagenome-atlas/atlas/compare/v2.13.1...v2.14.0

## 2.13

* use minimap for contigs, genecatalog and genomes in https://github.com/metagenome-atlas/atlas/pull/569 https://github.com/metagenome-atlas/atlas/pull/577
* filter genomes myself in https://github.com/metagenome-atlas/atlas/pull/568
The filter function is defined in the config file:
```
genome_filter_criteria: "(Completeness-5*Contamination >50 ) & (Length_scaffolds >=50000) & (Ambigious_bases <1e6) & (N50 > 5*1e3) & (N_scaffolds < 1e3)"
```
The genome filtering is similar to other publications in the field, e.g. GTDB. What is maybe a bit different is that genomes with completeness around 50% **and** contamination around 10% are excluded, whereas the default dRep parameters would include those.

* use dRep again in https://github.com/metagenome-atlas/atlas/pull/579
We saw better performance using dRep. This now also scales to ~1K samples.
* Use the new DRAM version 1.4 in https://github.com/metagenome-atlas/atlas/pull/564


**Full Changelog**: https://github.com/metagenome-atlas/atlas/compare/v2.12.0...v2.13.0

## 2.12

* GTDB-tk requires rule `extract_gtdb` to run first by @Waschina in https://github.com/metagenome-atlas/atlas/pull/551
* use Galah instead of dRep
* use bbsplit for mapping to genomes (maybe move to minimap in future)
* faster gene catalog quantification using minimap
* Compatible with snakemake v7.15

### New Contributors
* @Waschina made their first contribution in https://github.com/metagenome-atlas/atlas/pull/551

**Full Changelog**: https://github.com/metagenome-atlas/atlas/compare/v2.11.1...v2.12.0

## 2.11
* Make atlas handle large gene catalogs using parquet and pyfastx (Fix #515)

parquet files can be opened in python with
```
import pandas as pd
coverage = pd.read_parquet("working_dir/Genecatalog/counts/median_coverage.parquet")
coverage.set_index("GeneNr", inplace=True)
```

and in R it should be something like:

```
arrow::read_parquet("working_dir/Genecatalog/counts/median_coverage.parquet")
```


**Full Changelog**: https://github.com/metagenome-atlas/atlas/compare/v2.10.0...v2.11.0

## [2.10](https://github.com/metagenome-atlas/atlas/compare/v2.9.1...v2.10.0)

### Features
* GTDB version 207
* Low memory taxonomic annotation


## [2.9](https://github.com/metagenome-atlas/atlas/compare/v2.8.2...v2.9.0)

### Features
* ✨ Start an atlas project from public data in SRA [Docs](https://metagenome-atlas.readthedocs.io/en/latest/usage/getting_started.html#start-a-new-project-with-public-data)
* Make atlas ready for python 3.10 https://github.com/metagenome-atlas/atlas/pull/498
* Add strain profiling using inStrain. You can run `atlas run genomes strains`

### New Contributors
* @alienzj made their first contribution, fixing the config when running DRAM annotate, in https://github.com/metagenome-atlas/atlas/pull/495


## 2.8
This is a major update of metagenome-atlas. It was developed for the [3-day course in Finland](https://silask.github.io/talk/3-day-course-on-metagenome-atlas/), which is also why it has a Finnish release name.


### New binners
It integrates the bleeding-edge binners `Vamb` and `SemiBin`, which use co-binning based on co-abundance. Thank you @yanhui09 and @psj1997 for helping with this. First results show better performance with these binners than with the default.
+
+### Pathway annotations
+The command `atlas run genomes` produces genome-level functional annotations and KEGG pathways and their respective modules. It uses DRAM from @shafferm with a hack to produce all available KEGG modules.
+
+[See more](https://metagenome-atlas.readthedocs.io/en/v2.8.0/usage/output.html#annotations)
+
+### Genecatalog
+The command `atlas run genecatalog` now directly produces the abundances of the different genes. See more in #276
+
+> In the future, this part of the pipeline will include protein assembly to better tackle complicated metagenomes.
+
+### Minor updates
+
+#### Reports are back
+See for example the [QC report](https://metagenome-atlas.readthedocs.io/en/v2.8.0/_static/QC_report.html)
+
+#### Update of all underlying tools
+All tools used in atlas are now up to date, from assembler to GTDB.
+The one exception is BBMap, which contains a [bug](https://sourceforge.net/p/bbmap/tickets/48/) and ignores the minidentity parameter.
+
+#### Atlas init
+Atlas init correctly parses fastq files even if they are in subfolders and if paired-end files are named simply Sample_1/Sample_2. @Sofie8 will be happy about this.
+Atlas log uses nice colors.
+
+#### Default clustering of Subspecies
+
+The default ANI threshold for genome-dereplication was set to 97.5% to include more sub-species diversity.
+
+[See more](https://metagenome-atlas.readthedocs.io/en/v2.8.0/usage/output.html#genomes)
+
+
+
+
+
+
+
+
diff --git a/docs/usage/changelog.md b/docs/usage/changelog.md
index 9b178ffb..025ba0e6 100644
--- a/docs/usage/changelog.md
+++ b/docs/usage/changelog.md
@@ -1,167 +1,5 @@
+(_changelog)=
 
-
-# Change log
-
-## 2.17
-
-### Skani
-The tool Skani claims to be better and faster than the combination of mash + FastANI as used by dRep
-I implemented the skin for species clustering.
-We now do the species clustering in the `atlas run binning` step.
-So you get information about the number of dereplicated species in the binning report. This allows you to run different binners before choosing the one to use for the genome annotation.
-Also, the file storage was improved all important files are in `Binning/{binner}/`
-
-
-
-My custom species clustering does the following steps:
-
-1. Pre-cluster genomes with *single-linkage* at 92.5 ANI.
-2. **Re-calibrate checkm2 results.**
-  - If a minority of genomes from a pre-cluster use a different translation table they are removed
-  - If some genomes of a pre-cluster don't use the specialed completeness model we re-calibrate completeness to the minimum value.
-  This ensures that not a bad genome evaluated on the general model is preferred over a better genome evaluated on the specific model.
-See also https://silask.github.io/post/better_genomes/ Section 2.
-- Drop genomes that don't correspond to the filter criteria after re-calibration
-3. Cluster genomes with ANI threshold default 95%
-4. Select the best genome as representative based on the Quality score Completeness - 5x Contamination
-
-
-
-
-
-### New Contributors
-* @jotech made their first contribution in https://github.com/metagenome-atlas/atlas/pull/667
-
-## 2.16
-
- * gtdb08
-
-## 2.15
-
-* Use Gunc
-* New Folder organisation: Main output files for Binning are in the new folder `Binning`
-* Use hdf-format for gene catalogs. Allow efficient storage and selective access to large count and coverage matrices from the genecatalog.
(See docs for how to load them) https://github.com/metagenome-atlas/atlas/pull/621 -* Semibin v. 1.5 by @SilasK in https://github.com/metagenome-atlas/atlas/pull/622 - - -## 2.14 - -* Support for checkm2 by @SilasK in https://github.com/metagenome-atlas/atlas/pull/607 - -Thank you @trickovicmatija for your help. - -**Full Changelog**: https://github.com/metagenome-atlas/atlas/compare/v2.13.1...v2.14.0 -## 2.13 - -* use minimap for contigs, genecatalog and genomes in https://github.com/metagenome-atlas/atlas/pull/569 https://github.com/metagenome-atlas/atlas/pull/577 -* filter genomes my self in https://github.com/metagenome-atlas/atlas/pull/568 -The filter function is defined in the config file: -``` -genome_filter_criteria: "(Completeness-5*Contamination >50 ) & (Length_scaffolds >=50000) & (Ambigious_bases <1e6) & (N50 > 5*1e3) & (N_scaffolds < 1e3)" -``` -The genome filtering is similar as other publications in the field, e.g. GTDB. What is maybe a bit different is that genomes with completeness around 50% **and** contamination around 10% are excluded where as using the default parameters dRep would include those. - -* use Drep again in https://github.com/metagenome-atlas/atlas/pull/579 -We saw better performances using drep. This scales also now to ~1K samples -* Use new Dram version 1.4 by in https://github.com/metagenome-atlas/atlas/pull/564 - - -**Full Changelog**: https://github.com/metagenome-atlas/atlas/compare/v2.12.0...v2.13.0 - -## 2.12 - -* GTDB-tk requires rule `extract_gtdb` to run first by @Waschina in https://github.com/metagenome-atlas/atlas/pull/551 -* use Galah instead of Drep -* use bbsplit for mapping to genomes (maybe move to minimap in future) -* faster gene catalogs quantification using minimap. -* Compatible with snakemake v7.15 -### New Contributors -* @Waschina made their first contribution in https://github.com/metagenome-atlas/atlas/pull/551 - -**Full Changelog**: https://github.com/metagenome-atlas/atlas/compare/v2.11.1...v2.12.0 - -## 2.11 -* Make atlas handle large gene catalogs using parquet and pyfastx (Fix #515) - -parquet files can be opened in python with -``` -import pandas as pd -coverage = pd.read_parquet("working_dir/Genecatalog/counts/median_coverage.parquet") -coverage.set_index("GeneNr", inplace=True) - -``` - -and in R it should be something like: - -``` -arrow::read_parquet("working_dir/Genecatalog/counts/median_coverage.parquet") - -``` - - -**Full Changelog**: https://github.com/metagenome-atlas/atlas/compare/v2.10.0...v2.11.0 - -## [2.10](https://github.com/metagenome-atlas/atlas/compare/v2.9.1...v2.10.0) - -### Features -* GTDB version 207 -* Low memory taxonomic annotation - - -## [2.9](https://github.com/metagenome-atlas/atlas/compare/v2.8.2...v2.9.0) - -### Features -* ✨ Start an atlas project from public data in SRA [Docs](https://metagenome-atlas.readthedocs.io/en/latest/usage/getting_started.html#start-a-new-project-with-public-data) -* Make atlas ready for python 3.10 https://github.com/metagenome-atlas/atlas/pull/498 -* Add strain profiling using inStrain You can run `atlas run genomes strains` - -### New Contributors -* @alienzj made their first contribution to fix config when run DRAM annotate in https://github.com/metagenome-atlas/atlas/pull/495 - - -## 2.8 -This is a major update of metagenome-atlas. It was developed for the [3-day course in Finnland](https://silask.github.io/talk/3-day-course-on-metagenome-atlas/), that's also why it has a finish release name. 
- - -### New binners -It integrates bleeding-edge binners `Vamb` and `SemiBin` that use Co-binning based on co-abundance. Thank you @yanhui09 and @psj1997 for helping with this. The first results show better results using these binners over the default. - -[See more](https://metagenome-atlas.readthedocs.io/en/v2.8.0/usage/output.html#binning) - -### Pathway annotations -The command `atlas run genomes` produces genome-level functional annotation and Kegg pathways respective modules. It uses DRAM from @shafferm with a hack to produce all available Kegg modules. - -[See more](https://metagenome-atlas.readthedocs.io/en/v2.8.0/usage/output.html#annotations) - -### Genecatalog -The command `atlas run genecatalog` now produces directly the abundance of the different genes. See more in #276 - -> In future this part of the pipeline will include protein assembly to better tackle complicated metagenomes. - -### Minor updates - -#### Reports are back -See for example the [QC report](https://metagenome-atlas.readthedocs.io/en/v2.8.0/_static/QC_report.html) - -#### Update of all underlying tools -All tools use in atlas are now up to date. From assebler to GTDB. -The one exception is, BBmap which contains a [bug](https://sourceforge.net/p/bbmap/tickets/48/) and ignores the minidenty parameter. - -#### Atlas init -Atlas init correctly parses fastq files even if they are in subfolders and if paired-ends are named simply Sample_1/Sample_2. @Sofie8 will be happy about this. -Atlas log uses nice colors. - -#### Default clustering of Subspecies - -The default ANI threshold for genome-dereplication was set to 97.5% to include more sub-species diversity. - -[See more](https://metagenome-atlas.readthedocs.io/en/v2.8.0/usage/output.html#genomes) - - - - - - - - +```{include} ../../CHANGELOG.md +``` \ No newline at end of file From ee738640e0bdabdaf88b36da1107af57423ea60b Mon Sep 17 00:00:00 2001 From: silask Date: Fri, 28 Jun 2024 15:12:02 +0200 Subject: [PATCH 22/37] github actions to automate changelog --- .github/workflows/conventional-prs.yml | 22 ++++++++++++++++ .github/workflows/release-please.yml | 35 ++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 .github/workflows/conventional-prs.yml create mode 100644 .github/workflows/release-please.yml diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml new file mode 100644 index 00000000..9ac02630 --- /dev/null +++ b/.github/workflows/conventional-prs.yml @@ -0,0 +1,22 @@ +name: PR +on: + pull_request_target: + types: + - opened + - reopened + - edited + - synchronize + +permissions: + contents: read + +jobs: + title-format: + permissions: + pull-requests: read # for amannn/action-semantic-pull-request to analyze PRs + statuses: write # for amannn/action-semantic-pull-request to mark status of analyzed PR + runs-on: ubuntu-latest + steps: + - uses: amannn/action-semantic-pull-request@v5.0.2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml new file mode 100644 index 00000000..185fb2e7 --- /dev/null +++ b/.github/workflows/release-please.yml @@ -0,0 +1,35 @@ +on: + push: + branches: + - main + +name: release-please + +jobs: + release-please: + runs-on: ubuntu-latest + steps: + - uses: GoogleCloudPlatform/release-please-action@v3 + id: release + with: + release-type: python + package-name: snakemake + + - uses: actions/checkout@v3 + if: ${{ steps.release.outputs.release_created }} + with: + fetch-depth: 0 + 
+
+      - name: Set up Python
+        if: ${{ steps.release.outputs.release_created }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.x"
+
+      - name: Build and check package
+        if: ${{ steps.release.outputs.release_created }}
+        run: |
+          python -m pip install --upgrade pip
+          pip install build twine
+          python -m build
+          twine check --strict dist/*

From 00e91d70db1167a3c9186ab415b8092e9e67ada1 Mon Sep 17 00:00:00 2001
From: silask
Date: Fri, 28 Jun 2024 15:27:51 +0200
Subject: [PATCH 23/37] add codespell

---
 .github/workflows/codespell.yml | 25 +++++++++++++++++++++++++
 docs/pyproject.toml             |  6 ++++++
 2 files changed, 31 insertions(+)
 create mode 100644 .github/workflows/codespell.yml
 create mode 100644 docs/pyproject.toml

diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
new file mode 100644
index 00000000..23cbf7d7
--- /dev/null
+++ b/.github/workflows/codespell.yml
@@ -0,0 +1,25 @@
+# Codespell configuration is within pyproject.toml
+---
+name: Codespell
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+permissions:
+  contents: read
+
+jobs:
+  codespell:
+    name: Check for spelling errors
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Codespell
+        uses: codespell-project/actions-codespell@v2
+        with:
+          ignore_words_list: Crate,crate

diff --git a/docs/pyproject.toml b/docs/pyproject.toml
new file mode 100644
index 00000000..0c628a96
--- /dev/null
+++ b/docs/pyproject.toml
@@ -0,0 +1,6 @@
+[tool.codespell]
+# Ref: https://github.com/codespell-project/codespell#using-a-config-file
+skip = '.git,*.pdf,*.svg,versioneer.py,*.css,test_*'
+check-hidden = true
+ignore-regex = '^\s*"image/\S+": ".*|\b[Mm]anuel[. ][Hh]oltgrewe\b'
+ignore-words-list = 'testin'
\ No newline at end of file

From a9d2193e65ec8ef61ef5ff76941fdcb2c3791a38 Mon Sep 17 00:00:00 2001
From: silask
Date: Fri, 28 Jun 2024 17:03:19 +0200
Subject: [PATCH 24/37] add versioneer

---
 docs/pyproject.toml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/docs/pyproject.toml b/docs/pyproject.toml
index 0c628a96..cdf1fb56 100644
--- a/docs/pyproject.toml
+++ b/docs/pyproject.toml
@@ -3,4 +3,12 @@
 skip = '.git,*.pdf,*.svg,versioneer.py,*.css,test_*'
 check-hidden = true
 ignore-regex = '^\s*"image/\S+": ".*|\b[Mm]anuel[. ][Hh]oltgrewe\b'
-ignore-words-list = 'testin'
\ No newline at end of file
+ignore-words-list = 'testin'
+
+
+[tool.versioneer]
+VCS = "git"
+style = "pep440"
+versionfile_source = "atlas/_version.py"
+versionfile_build = "atlas/_version.py"
+tag_prefix = "v"

From 927f64af9d03bb88a532eae042eed91f6066d871 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 28 Jun 2024 15:05:50 +0000
Subject: [PATCH 25/37] chore(main): release 2.18.2

---
 CHANGELOG.md | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2610beb9..65d83f5c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,13 @@
+# Change log
 
+## [2.18.2](https://github.com/metagenome-atlas/atlas/compare/v2.18.1...v2.18.2) (2024-06-28)
 
-# Change log
 
+### Bug Fixes
+
+* 676 ([8b4d552](https://github.com/metagenome-atlas/atlas/commit/8b4d5522afe2b35265ea406ac2a4b7d0edf571fb))
+* 701 ([ce22404](https://github.com/metagenome-atlas/atlas/commit/ce224044ee13db9647b74a6cba726006f04ec861))
 
 ## 2.18.1
 
 Fix error with downloading DRAM. Update to DRAM v1.5
@@ -175,12 +180,4 @@ Atlas log uses nice colors.
 The default ANI threshold for genome-dereplication was set to 97.5% to include more sub-species diversity.
 
-[See more](https://metagenome-atlas.readthedocs.io/en/v2.8.0/usage/output.html#genomes)
-
-
-
-
-
-
-
+[See more](https://metagenome-atlas.readthedocs.io/en/v2.8.0/usage/output.html#genomes)

From 5535622ba608b016b153319eb4c7e0748b23cab7 Mon Sep 17 00:00:00 2001
From: silask
Date: Fri, 28 Jun 2024 17:10:14 +0200
Subject: [PATCH 26/37] package is called metagenome-atlas

---
 .github/workflows/release-please.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml
index 185fb2e7..4822251a 100644
--- a/.github/workflows/release-please.yml
+++ b/.github/workflows/release-please.yml
@@ -13,7 +13,7 @@ jobs:
         id: release
         with:
           release-type: python
-          package-name: snakemake
+          package-name: metagenome-atlas
 
       - uses: actions/checkout@v3
         if: ${{ steps.release.outputs.release_created }}

From 01ff80b5eded7ceb432fce77969165bd2937617a Mon Sep 17 00:00:00 2001
From: silask
Date: Fri, 28 Jun 2024 17:25:27 +0200
Subject: [PATCH 27/37] add formatting

---
 .github/workflows/format.yml | 38 ++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 .github/workflows/format.yml

diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
new file mode 100644
index 00000000..124c4539
--- /dev/null
+++ b/.github/workflows/format.yml
@@ -0,0 +1,38 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+concurrency:
+  # Cancel concurrent flows on PRs
+  group: ci-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  formatting:
+    permissions:
+      contents: read # for actions/checkout to fetch code
+      pull-requests: write # for marocchino/sticky-pull-request-comment to create or update PR comment
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: mamba-org/setup-micromamba@v1
+        with:
+          environment-name: black
+          create-args: black
+          cache-environment: true
+
+      - name: Check formatting
+        shell: bash -el {0}
+        run: black --check --diff .
+
+      - name: Comment PR
+        if: github.event_name == 'pull_request' && failure()
+        uses: marocchino/sticky-pull-request-comment@v2.8.0
+        with:
+          message: "Please format your code with [black](https://black.readthedocs.io): `black .`."
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

From 330cc4f4326f7e0b40067d7fd28fa56cb3404824 Mon Sep 17 00:00:00 2001
From: silask
Date: Fri, 28 Jun 2024 17:27:21 +0200
Subject: [PATCH 28/37] snakefmt formatting

---
 .github/workflows/format.yml | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
index 124c4539..2037581a 100644
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@@ -22,17 +22,24 @@ jobs:
 
       - uses: mamba-org/setup-micromamba@v1
         with:
-          environment-name: black
-          create-args: black
+          environment-name: formatting
+          create-args: black snakefmt
           cache-environment: true
 
-      - name: Check formatting
+      - name: Check Black formatting
        shell: bash -el {0}
         run: black --check --diff .
 
+      - name: Check Snakefmt formatting
+        shell: bash -el {0}
+        run: snakefmt --check --diff .
+
       - name: Comment PR
         if: github.event_name == 'pull_request' && failure()
         uses: marocchino/sticky-pull-request-comment@v2.8.0
         with:
-          message: "Please format your code with [black](https://black.readthedocs.io): `black .`."
+ message: | + Please format your code with: + - [black](https://black.readthedocs.io): `black .` + - [snakefmt](https://github.com/snakemake/snakefmt): `snakefmt .` GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 16fc1b4b888d0ab831ea572ee535c9cdee4b1221 Mon Sep 17 00:00:00 2001 From: silask Date: Fri, 28 Jun 2024 20:27:45 +0200 Subject: [PATCH 29/37] install snakefmt with bioconda --- .github/workflows/format.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 2037581a..7149d903 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -24,6 +24,10 @@ jobs: with: environment-name: formatting create-args: black snakefmt + condarc: | + channels: + - conda-forge + - bioconda cache-environment: true - name: Check Black formatting From d7cf4c4d5082fa02dcfeb08b1bfc15337916d8cd Mon Sep 17 00:00:00 2001 From: silask Date: Fri, 28 Jun 2024 20:36:32 +0200 Subject: [PATCH 30/37] ignore some files --- .github/workflows/codespell.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index 23cbf7d7..7b8da68b 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -22,4 +22,6 @@ jobs: - name: Codespell uses: codespell-project/actions-codespell@v2 with: - ignore_words_list: Crate,crate + check_filenames: true + skip: ".git,*.pdf,*.svg,versioneer.py,*.css,*.html" + check_hidden: true From 8326c57169b6ff1c838b6f196829178d14779b7b Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 25 Jun 2024 12:04:56 +0200 Subject: [PATCH 31/37] update verstion and paths --- workflow/rules/download.smk | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index 85e56cd9..57896928 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -14,6 +14,7 @@ EGGNOG_DIR = DBDIR/ ("EggNOG_V" + EGGNOG_VERSION) CONDAENV = "../envs" +<<<<<<< HEAD ## GTDBTk @@ -62,6 +63,11 @@ rule extract_gtdb: '( cat {input} | tar -xzvf - -C "{GTDBTK_DATA_PATH}" --strip 1 ) 2> {log.stderr} > {log.stdout} ' ### end GTDBTk +======= +GTDB_VERSION = "V09_R200" +GTDB_DATA_URL = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" +GTDBTK_DATA_PATH = os.path.join(DBDIR, "GTDB_" + GTDB_VERSION) +>>>>>>> d378298 (update verstion and paths) def md5(fname): From ed35c07d0d1f1ccf540972ec1538736c6a8bf01a Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 25 Jun 2024 23:24:04 +0200 Subject: [PATCH 32/37] use split download from gtdbtk --- workflow/rules/download.smk | 68 +++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index 57896928..dcd35f8e 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -1,19 +1,35 @@ import hashlib import os from pathlib import Path +from pathlib import Path # this values are incuded in the snakefile DBDIR = Path(config["database_dir"]).resolve() +GUNCDIR = DBDIR/ "gunc_database" +BUSCODIR = DBDIR/ "busco_lineages" +DBDIR = Path(config["database_dir"]).resolve() + GUNCDIR = DBDIR/ "gunc_database" BUSCODIR = DBDIR/ "busco_lineages" ZENODO_ARCHIVE = "1134890" EGGNOG_VERSION = "5" EGGNOG_DIR = DBDIR/ ("EggNOG_V" + EGGNOG_VERSION) +EGGNOG_DIR = DBDIR/ ("EggNOG_V" + EGGNOG_VERSION) CONDAENV = "../envs" + +## GTDBTk + + +## GTDBTk + +<<<<<<< HEAD 
+ +## GTDBTk + <<<<<<< HEAD ## GTDBTk @@ -65,9 +81,50 @@ rule extract_gtdb: ### end GTDBTk ======= GTDB_VERSION = "V09_R200" -GTDB_DATA_URL = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" -GTDBTK_DATA_PATH = os.path.join(DBDIR, "GTDB_" + GTDB_VERSION) ->>>>>>> d378298 (update verstion and paths) +GTDB_DATA_URL = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package" +GTDBTK_DATA_PATH = DBDIR/ ("GTDB_" + GTDB_VERSION) + + +def all_partial_gtdb_tarbals(wildcards,GTDB_REFSEQ_VERSION=220,GTDB_PATIAL_SUFFIXES=["a"+i for i in "abcdefghijk"]): + + return expand(GTDBTK_DATA_PATH/"gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}", + gtdb_refseq_version= GTDB_REFSEQ_VERSION, + suffix=GTDB_PATIAL_SUFFIXES) + + +localrules: + download_partial_gtdb, extract_gtdb + + +rule download_partial_gtdb: + output: + temp(GTDBTK_DATA_PATH/"gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}"), + threads: 1 + params: + url = lambda wc,output: f"{GTDB_DATA_URL}/split_package/{ Path(output[0]).name}" + resources: + time_min=60 * int(config.get("runtime", {"long": 10})["long"]), + log: + "logs/download/gtdbtk_r{gtdb_refseq_version}_part_{suffix}.log", + shell: + " wget --no-check-certificate {params.url} -O {output} &> {log} " + + +rule extract_gtdb: + input: + all_partial_gtdb_tarbals + output: + touch(os.path.join(GTDBTK_DATA_PATH, "downloaded_success")), + threads: 1 + resources: + time_min=60 * int(config.get("runtime", {"long": 10})["long"]), + log: + stdout="logs/download/gtdbtk_untar.log", + stderr="logs/download/gtdbtk_untar.err", + shell: + '( cat {input} | tar -xzvf - -C "{GTDBTK_DATA_PATH}" --strip 1 ) 2> {log.stderr} > {log.stdout} ' + +### end GTDBTk def md5(fname): @@ -91,6 +148,10 @@ FILES = { +} + + + def get_eggnog_db_file(): return ancient( @@ -120,6 +181,7 @@ rule download: get_eggnog_db_file(), f"{DBDIR}/CheckM2", GTDBTK_DATA_PATH/ "downloaded_success" + GTDBTK_DATA_PATH/ "downloaded_success" rule download_eggNOG_files: From 1ebc799ea91930bdbe271f1131667808353eb2fb Mon Sep 17 00:00:00 2001 From: silask Date: Fri, 28 Jun 2024 17:22:33 +0200 Subject: [PATCH 33/37] specify target rule when downloading --- atlas/atlas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/atlas/atlas.py b/atlas/atlas.py index 265f2665..c5e42cba 100644 --- a/atlas/atlas.py +++ b/atlas/atlas.py @@ -245,7 +245,7 @@ def run_download(db_dir, jobs, snakemake_args): """ cmd = ( - "snakemake --snakefile {snakefile} " + "snakemake --snakefile {snakefile} download " "--jobs {jobs} --rerun-incomplete " "--conda-frontend mamba --scheduler greedy " "--nolock --use-conda --conda-prefix {conda_prefix} " From 703780b3cf927d22af87f42805859f63c6038f27 Mon Sep 17 00:00:00 2001 From: silask Date: Fri, 28 Jun 2024 17:23:09 +0200 Subject: [PATCH 34/37] format --- workflow/rules/download.smk | 114 ++++++++---------------------------- 1 file changed, 25 insertions(+), 89 deletions(-) diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index dcd35f8e..f2ef654b 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -6,102 +6,49 @@ from pathlib import Path # this values are incuded in the snakefile DBDIR = Path(config["database_dir"]).resolve() -GUNCDIR = DBDIR/ "gunc_database" -BUSCODIR = DBDIR/ "busco_lineages" -DBDIR = Path(config["database_dir"]).resolve() - -GUNCDIR = DBDIR/ "gunc_database" -BUSCODIR = DBDIR/ "busco_lineages" +GUNCDIR = DBDIR / 
"gunc_database" +BUSCODIR = DBDIR / "busco_lineages" ZENODO_ARCHIVE = "1134890" EGGNOG_VERSION = "5" -EGGNOG_DIR = DBDIR/ ("EggNOG_V" + EGGNOG_VERSION) -EGGNOG_DIR = DBDIR/ ("EggNOG_V" + EGGNOG_VERSION) +EGGNOG_DIR = DBDIR / ("EggNOG_V" + EGGNOG_VERSION) CONDAENV = "../envs" ## GTDBTk - -## GTDBTk - -<<<<<<< HEAD - -## GTDBTk - -<<<<<<< HEAD - -## GTDBTk - -GTDB_VERSION = "V09_R200" -GTDB_DATA_URL = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package" -GTDBTK_DATA_PATH = DBDIR/ ("GTDB_" + GTDB_VERSION) - - -def all_partial_gtdb_tarbals(wildcards,GTDB_REFSEQ_VERSION=220,GTDB_PATIAL_SUFFIXES=["a"+i for i in "abcdefghijk"]): - - return expand(GTDBTK_DATA_PATH/"gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}", - gtdb_refseq_version= GTDB_REFSEQ_VERSION, - suffix=GTDB_PATIAL_SUFFIXES) - - -localrules: - download_partial_gtdb, extract_gtdb - - -rule download_partial_gtdb: - output: - temp(GTDBTK_DATA_PATH/"gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}"), - threads: 1 - params: - url = lambda wc,output: f"{GTDB_DATA_URL}/split_package/{ Path(output[0]).name}" - resources: - time_min=60 * int(config.get("runtime", {"long": 10})["long"]), - log: - "logs/download/gtdbtk_r{gtdb_refseq_version}_part_{suffix}.log", - shell: - " wget --no-check-certificate {params.url} -O {output} &> {log} " - - -rule extract_gtdb: - input: - all_partial_gtdb_tarbals - output: - touch(os.path.join(GTDBTK_DATA_PATH, "downloaded_success")), - threads: 1 - resources: - time_min=60 * int(config.get("runtime", {"long": 10})["long"]), - log: - stdout="logs/download/gtdbtk_untar.log", - stderr="logs/download/gtdbtk_untar.err", - shell: - '( cat {input} | tar -xzvf - -C "{GTDBTK_DATA_PATH}" --strip 1 ) 2> {log.stderr} > {log.stdout} ' - -### end GTDBTk -======= GTDB_VERSION = "V09_R200" GTDB_DATA_URL = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package" -GTDBTK_DATA_PATH = DBDIR/ ("GTDB_" + GTDB_VERSION) - - -def all_partial_gtdb_tarbals(wildcards,GTDB_REFSEQ_VERSION=220,GTDB_PATIAL_SUFFIXES=["a"+i for i in "abcdefghijk"]): - - return expand(GTDBTK_DATA_PATH/"gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}", - gtdb_refseq_version= GTDB_REFSEQ_VERSION, - suffix=GTDB_PATIAL_SUFFIXES) +GTDBTK_DATA_PATH = DBDIR / ("GTDB_" + GTDB_VERSION) + + +def all_partial_gtdb_tarbals( + wildcards, + GTDB_REFSEQ_VERSION=220, + GTDB_PATIAL_SUFFIXES=["a" + i for i in "abcdefghijk"], +): + return expand( + GTDBTK_DATA_PATH / "gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}", + gtdb_refseq_version=GTDB_REFSEQ_VERSION, + suffix=GTDB_PATIAL_SUFFIXES, + ) localrules: - download_partial_gtdb, extract_gtdb + download_partial_gtdb, + extract_gtdb, rule download_partial_gtdb: output: - temp(GTDBTK_DATA_PATH/"gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}"), + temp( + GTDBTK_DATA_PATH + / "gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}" + ), threads: 1 params: - url = lambda wc,output: f"{GTDB_DATA_URL}/split_package/{ Path(output[0]).name}" + url=lambda wc, output: f"{GTDB_DATA_URL}/split_package/{ Path(output[0]).name}", resources: time_min=60 * int(config.get("runtime", {"long": 10})["long"]), log: @@ -112,7 +59,7 @@ rule download_partial_gtdb: rule extract_gtdb: input: - all_partial_gtdb_tarbals + all_partial_gtdb_tarbals, output: touch(os.path.join(GTDBTK_DATA_PATH, "downloaded_success")), threads: 1 @@ -143,16 +90,9 @@ FILES = { "adapters.fa": "ae839dc79cfb855a1b750a0d593fe01e", "phiX174_virus.fa": 
"82516880142e8c89b466bc6118696c47", "silva_rfam_all_rRNAs.fa": "f102e35d9f48eabeb0efe9058559bc66", - } - -} - - - - def get_eggnog_db_file(): return ancient( expand( @@ -180,8 +120,7 @@ rule download: ), get_eggnog_db_file(), f"{DBDIR}/CheckM2", - GTDBTK_DATA_PATH/ "downloaded_success" - GTDBTK_DATA_PATH/ "downloaded_success" + GTDBTK_DATA_PATH / "downloaded_success", rule download_eggNOG_files: @@ -209,9 +148,6 @@ rule download_atlas_files: raise OSError(2, "Invalid checksum", output[0]) - - - rule checkm2_download_db: output: directory(f"{DBDIR}/CheckM2"), From ae3b01184236689dbf2497153eab6d47223edf7c Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Sat, 27 Jul 2024 11:47:05 +0200 Subject: [PATCH 35/37] sterr and stdout log for extract gtdb --- workflow/rules/download.smk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index f2ef654b..1a6687c5 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -68,9 +68,12 @@ rule extract_gtdb: log: stdout="logs/download/gtdbtk_untar.log", stderr="logs/download/gtdbtk_untar.err", + stdout="logs/download/gtdbtk_untar.log", + stderr="logs/download/gtdbtk_untar.err", shell: '( cat {input} | tar -xzvf - -C "{GTDBTK_DATA_PATH}" --strip 1 ) 2> {log.stderr} > {log.stdout} ' + ### end GTDBTk From e988caff4679024129017000755285b778d0a214 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Sat, 27 Jul 2024 11:47:05 +0200 Subject: [PATCH 36/37] sterr and stdout log for extract gtdb --- workflow/rules/download.smk | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index 1a6687c5..a18702af 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -68,10 +68,9 @@ rule extract_gtdb: log: stdout="logs/download/gtdbtk_untar.log", stderr="logs/download/gtdbtk_untar.err", - stdout="logs/download/gtdbtk_untar.log", - stderr="logs/download/gtdbtk_untar.err", shell: '( cat {input} | tar -xzvf - -C "{GTDBTK_DATA_PATH}" --strip 1 ) 2> {log.stderr} > {log.stdout} ' + '( cat {input} | tar -xzvf - -C "{GTDBTK_DATA_PATH}" --strip 1 ) 2> {log.stderr} > {log.stdout} ' ### end GTDBTk From 633c7b32cb740a488c626ce6087bc3f67aee4785 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Sun, 28 Jul 2024 13:55:46 +0200 Subject: [PATCH 37/37] Update CHANGELOG.md --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 65d83f5c..89e1b99e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Change log +## [2.19.0](https://github.com/metagenome-atlas/atlas/compare/v2.18.2...v2.19.0) (2024-07-28) +* GTDB V9 R220 +* Spades v4 ## [2.18.2](https://github.com/metagenome-atlas/atlas/compare/v2.18.1...v2.18.2) (2024-06-28)