diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index aea5027..21208a4 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -41,11 +41,10 @@ jobs:
machine: [x64_linux]
platform: [avx2]
compiler: [14]
- leiden: [true]
include:
- - {machine: arm64_linux, platform: arm8, compiler: 12, leiden: true}
- - {machine: x64_mac, platform: avx2, compiler: 12, leiden: false}
- - {machine: arm64_mac, platform: m1, compiler: 13, leiden: false}
+ - {machine: arm64_linux, platform: arm8, compiler: 12}
+ - {machine: x64_mac, platform: avx2, compiler: 12}
+ - {machine: arm64_mac, platform: m1, compiler: 12}
runs-on: [self-hosted, vclust, '${{ matrix.machine }}']
env:
@@ -53,7 +52,7 @@ jobs:
steps:
- name: make
- run: make -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} STATIC_LINK=true PLATFORM=${{ matrix.platform }} LEIDEN=${{ matrix.leiden }}
+ run: make -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} PLATFORM=${{ matrix.platform }} LEIDEN=true
- name: tar artifacts
run: |
mkdir ${DIR}
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index c657d38..3d56435 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,4 +1,4 @@
-name: GitHub Actions CI
+name: Build and tests
on:
push:
@@ -16,7 +16,7 @@ jobs:
fail-fast: false
matrix:
machine: [ubuntu-latest, macOS-12]
- compiler: [g++-12]
+ compiler: [12]
runs-on: ['${{ matrix.machine }}']
steps:
@@ -26,7 +26,7 @@ jobs:
- name: make
run: |
- make -j32 CXX=${{matrix.compiler}}
+ make -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}}
- name: tar artifacts
run: tar -cvf vclust.tar ./vclust.py ./test.py ./example ./bin/kmer-db ./bin/lz-ani ./bin/clusty ./bin/multi-fasta-split
@@ -43,7 +43,6 @@ jobs:
fail-fast: false
matrix:
machine: [ubuntu-latest, macOS-12]
- compiler: [g++-11]
runs-on: ['${{ matrix.machine }}']
diff --git a/.github/workflows/self-hosted.yml b/.github/workflows/self-hosted.yml
index 64c8120..fc4e6f5 100644
--- a/.github/workflows/self-hosted.yml
+++ b/.github/workflows/self-hosted.yml
@@ -41,21 +41,20 @@ jobs:
machine: [x64_linux]
platform: [avx2]
compiler: [14]
- leiden: [true]
include:
- - {machine: arm64_linux, platform: arm8, compiler: 12, leiden: true}
- - {machine: x64_mac, platform: avx2, compiler: 12, leiden: false}
- - {machine: arm64_mac, platform: m1, compiler: 13, leiden: false}
+ - {machine: arm64_linux, platform: arm8, compiler: 12}
+ - {machine: x64_mac, platform: avx2, compiler: 12}
+ - {machine: arm64_mac, platform: m1, compiler: 12}
runs-on: [self-hosted, vclust, '${{ matrix.machine }}']
steps:
- name: make
- run: make -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} STATIC_LINK=true PLATFORM=${{ matrix.platform }} LEIDEN=${{ matrix.leiden }}
+ run: make -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} PLATFORM=${{ matrix.platform }} LEIDEN=true
########################################################################################
- pipeline:
- name: Pipeline
+ pipeline-linux:
+ name: Pipeline (linux)
needs: make
strategy:
fail-fast: false
@@ -64,7 +63,23 @@ jobs:
runs-on: [self-hosted, vclust, '${{ matrix.machine }}']
steps:
-
- name: run pipeline
run: |
pytest test.py
+
+ ########################################################################################
+ pipeline-macos:  # Runs the pytest suite (test.py) on the self-hosted macOS runners.
+ name: Pipeline (macOS)
+ needs: make  # Starts only after the `make` job has completed successfully.
+ strategy:
+ fail-fast: false  # A failure on one machine does not cancel the other matrix entry.
+ matrix:
+ machine: [x64_mac, arm64_mac]  # Same macOS machine labels as the `make` matrix include entries.
+ runs-on: [self-hosted, vclust, '${{ matrix.machine }}']  # Runner must carry all three labels.
+
+ steps:
+
+ - name: run pipeline  # NOTE(review): the script sources a virtualenv at a hard-coded user home path; presumably required on these runners - confirm it exists on both.
+ run: |
+ source /Users/agudys/agudys-env/bin/activate
+ pytest test.py
diff --git a/3rd_party/clusty b/3rd_party/clusty
index 5503c50..d80c26a 160000
--- a/3rd_party/clusty
+++ b/3rd_party/clusty
@@ -1 +1 @@
-Subproject commit 5503c507a55f4982f307e0c3585fbc749aa46bbb
+Subproject commit d80c26aec4c09a4715cb43763fa66c5baf8d9968
diff --git a/README.md b/README.md
index d895192..2322090 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
# Vclust
-![version](https://img.shields.io/badge/version-1.2.5-blue.svg)
+![version](https://img.shields.io/badge/version-1.2.6-blue.svg)
[![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/vclust/total.svg?style=flag&label=GitHub%20downloads)](https://github.com/refresh-bio/vclust/releases)
-[![GitHub Actions CI](../../workflows/GitHub%20Actions%20CI/badge.svg)](../../actions/workflows/main.yml)
+[![Build and tests](../../workflows/Build%20and%20tests/badge.svg)](../../actions/workflows/main.yml)
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
![x86-64](https://img.shields.io/static/v1?label=%E2%80%8B&message=x86-64&color=yellow&logo=PCGamingWiki&logoColor=white)
@@ -13,33 +13,7 @@
Vclust is an alignment-based tool for fast and accurate calculation of Average Nucleotide Identity (ANI) between complete or metagenomically-assembled viral genomes. The tool also performs ANI-based clustering of genomes according to standards recommended by international virus consortia, including *International Committee on Taxonomy of Viruses* (ICTV) and *Minimum Information about an Uncultivated Virus Genome* (MIUViG).
-
-## Table of contents
-
-1. [Features](#1-features)
-2. [Requirements](#2-requirements)
-3. [Installation](#3-installation)
-4. [Quick Start](#4-quick-start)
-5. [Input data](#5-input-data)
-6. [Usage](#6-usage)
- 1. [Prefilter](#61-prefilter)
- 2. [Align](#62-align)
- 3. [Cluster](#63-cluster)
-7. [Use cases](#7-use-cases)
- 1. [Classify viruses into species and genera following ICTV standards](#71-classify-viruses-into-species-and-genera-following-ictv-standards)
- 2. [Assign viral contigs into vOTUs following MIUViG standards](#72-assign-viral-contigs-into-votus-following-miuvig-standards)
- 3. [Dereplicate viral contigs into representative genomes](#73-dereplicate-viral-contigs-into-representative-genomes)
- 4. [Calculate pairwise similarities between all-versus-all genomes](#74-calculate-pairwise-similarities-between-all-versus-all-genomes)
- 5. [Process large dataset of diverse virus genomes (IMG/VR)](#75-process-large-dataset-of-diverse-virus-genomes-imgvr)
- 6. [Process large dataset of highly redundant virus genomes](#76-process-large-dataset-of-highly-redundant-virus-genomes)
- 7. [Cluster plasmid genomes into pOTUs](#77-cluster-plasmid-genomes-into-potus)
-8. [FAQ](#8-faq)
-9. [Tests](#9-tests)
-10. [Citation](#10-citation)
-11. [License](#11-license)
-
-
-## 1. Features
+## Features
#### :gem: Accurate ANI calculations
@@ -50,10 +24,10 @@ Vclust uses a Lempel-Ziv-based pairwise sequence aligner ([LZ-ANI](https://githu
Vclust offers multiple similarity measures between two genome sequences:
- **ANI**: The number of identical nucleotides across local alignments divided by the total length of the alignments.
- **Global ANI (gANI)**: The number of identical nucleotides across local alignments divided by the length of the query/reference genome.
-- **Total ANI (tANI)**: The number of identical nucleotides between query-reference and reference-query genomes divided by the sum length of both genomes. tANI is equivalent to the VIRIDIC's intergenomic similarity.
+- **Total ANI (tANI)**: The number of identical nucleotides between query-reference and reference-query genomes divided by the sum of the lengths of both genomes. tANI is equivalent to [VIRIDIC's intergenomic similarity](https://doi.org/10.3390/v12111268).
- **Coverage (alignment fraction)**: The proportion of the query/reference sequence aligned with the reference/query sequence.
-- **Number of local alignments**: The count of individual alignments found between the sequences.
-- **Ratio between genome lengths**: Ratio by dividing the length of the shorter sequence by the length of the longer sequence.
+- **Number of local alignments**: The number of local alignments between the two genome sequences.
+- **Ratio between genome lengths**: The length of the shorter genome divided by the length of the longer genome.
#### :star2: Multiple clustering algorithms
@@ -73,563 +47,50 @@ Vclust uses three efficient C++ tools - [Kmer-db](https://github.com/refresh-bio
For datasets containing up to 1000 viral genomes, Vclust is available at [http://www.vclust.org](http://www.vclust.org).
-## 2. Requirements
-
-Vclust requires Python 3.7 or higher.
-
-## 3. Installation
-
-To install Vclust, you can either download the pre-compiled binaries, install from Bioconda, or compile the dependencies from source. The compilation process typically takes a few minutes.
-
-### 3.1. Download precompiled binaries
-
-The quickest way to get started is by downloading prebuilt binaries from the [Releases tab](https://github.com/refresh-bio/vclust/releases). These binaries include the Leiden algorithm by default. Select your platform and download the tool.
-
-### 3.2. Installation via Bioconda
-
-Vclust is also available on Bioconda.
-
-```bash
-TODO
-```
-
-### 3.3. Compile from source
-
-#### 3.3.1. Requirements
-
-1. `make`
-2. `g++` version 11 or higher
-3. `cmake` version 3.12 or higher
-
-#### 3.3.2. Default Installation
-
-The default installation of Vclust includes all functionalities except for the Leiden clustering algorithm.
-
-```bash
-git clone --recurse-submodules https://github.com/refresh-bio/vclust
-cd vclust
-make -j
-```
-
-#### 3.3.3. Installation with Leiden algorithm support
-
-Vclust provides igraph's implementation of the Leiden algorithm. However, because igraph requires several external dependencies (CMake 3.18, Flex, Bison), it is not integrated with Vclust by default. To install these dependencies under Debian/Ubuntu Linux, use the following command:
-
-```bash
-sudo apt-get install cmake flex bison
-```
-
-Then, build Vclust with Leiden algorithm support:
+## Quick start
```bash
+# Clone repository and build Vclust
git clone --recurse-submodules https://github.com/refresh-bio/vclust
-cd vclust
-make -j LEIDEN=true
-```
-
-## 4. Quick start
-
-Follow these steps to quickly run Vclust on the provided example genomes. The process takes just a few seconds.
+cd vclust && make -j
-1. **Prefilter** similar genome sequence pairs before conducting pairwise alignments.
-
-```bash
+# Prefilter similar genome sequence pairs before conducting pairwise alignments.
./vclust.py prefilter -i example/multifasta.fna -o fltr.txt
-```
-
-2. **Align** similar genome sequence pairs and calculate pairwise ANI measures.
-```bash
+# Align similar genome sequence pairs and calculate pairwise ANI measures.
./vclust.py align -i example/multifasta.fna -o ani.tsv --filter fltr.txt
-```
-3. **Cluster** genome sequences based on given ANI measure and minimum threshold.
-
-```bash
+# Cluster genome sequences based on a given ANI measure and minimum threshold.
./vclust.py cluster -i ani.tsv -o clusters.tsv --ids ani.ids.tsv --metric ani --ani 0.95
```
+## Documentation
-## 5. Input data
-
-Vclust accepts a single FASTA file containing viral genomic sequences ([example](./example/multifasta.fna)) or a directory of FASTA files (one genome per file) ([example](./example/fna/)). The input file(s) can be gzipped.
-
-## 6. Usage
-
-Vclust provides three commands: `prefilter`, `align`, and `cluster`. Calls to these commands follow the structure:
-
-```
-./vclust.py command -i -o