connor-lab · m-bull · Apr 9, 2021 · Apr 9, 2021 · Apr 9, 2021 · Apr 9, 2021
diff --git a/.github/scripts/test_PR_against_release.sh b/.github/scripts/test_PR_against_release.sh
@@ -6,7 +6,7 @@ export PATH=/opt/conda/bin:$PATH
 singularity --version
 # write test log as github Action artifact
 echo Nextflow run current PR in --illumina mode.. >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile singularity \
        --directory $PWD/.github/data/fastqs/ \
        --illumina \
@@ -25,7 +25,7 @@ git checkout tags/v1.1.1
 sed -i s'/cpus = 4/cpus = 2/'g conf/resources.config
 ln -s ../*.sif ./
 echo Nextflow run previous release in --illumina mode.. >> ../artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile singularity \
        --directory $PWD/../.github/data/fastqs/ \
        --illumina \

diff --git a/.github/scripts/test_bed_ref_input.sh b/.github/scripts/test_bed_ref_input.sh
@@ -14,7 +14,7 @@ echo bed file: $BED_FILE >> artifacts/test_artifact.log
 # run current pull request code
 singularity --version
 echo Nextflow run --illumina mode with --ref, --bed .. >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile singularity \
        --ref $REF_FILE \
        --bed $BED_FILE \
@@ -58,7 +58,7 @@ rm $REF_FILE
 ln -s $REAL_REF $REF_FILE
 
 echo Nextflow run --illumina mode with symlinked --ref, --bed .. >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile singularity \
        --ref $REF_FILE \
        --bed $BED_FILE \

diff --git a/.github/scripts/test_conda_cache.sh b/.github/scripts/test_conda_cache.sh
@@ -10,7 +10,7 @@ export REPO=$PWD
 echo REPO=$REPO >> artifacts/test_artifact.log
 cd ..
 echo PWD=$PWD >> $REPO/artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run $REPO \
+nextflow run $REPO \
        -profile conda \
        --cache $REPO/conda_cache_dir \
        --directory $REPO/.github/data/fastqs/ \
@@ -25,7 +25,7 @@ cat .nextflow.log | grep 'Conda create complete env=/home/runner/work/ncov2019-a
 rm -rf results && rm -rf work && rm -rf .nextflow*
 # second NF run will use the conda env created in the previous run
 echo re-run pipeline with conda --cache.. >> $REPO/artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run $REPO \
+nextflow run $REPO \
        -profile conda \
        --cache $REPO/conda_cache_dir \
        --directory $REPO/.github/data/fastqs/ \

diff --git a/.github/scripts/test_cram_input.sh b/.github/scripts/test_cram_input.sh
@@ -14,7 +14,7 @@ echo bed file: $BED_FILE
 sed -i s'/cpus = 4/cpus = 2/'g conf/coguk/sanger.config
 singularity --version
 echo Nextflow run --illumina mode with --ref, --bed and --cram.. >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile sanger,singularity \
        --ref $REF_FILE \
        --bed $BED_FILE \

diff --git a/.github/scripts/test_cram_output.sh b/.github/scripts/test_cram_output.sh
@@ -14,7 +14,7 @@ echo bed file: $BED_FILE
 sed -i s'/cpus = 4/cpus = 2/'g conf/coguk/sanger.config
 singularity --version
 echo Nextflow run --illumina mode with --ref, --bed --cram and outCram... >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile sanger,singularity \
        --ref $REF_FILE \
        --bed $BED_FILE \

diff --git a/.github/scripts/test_nanopore_pipelines.sh b/.github/scripts/test_nanopore_pipelines.sh
@@ -6,7 +6,7 @@ export PATH=/opt/conda/bin:$PATH
 singularity --version
 # write test log as github Action artifact
 echo "Nextflow run current PR in --nanopolish mode (no barcodes).." >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile singularity \
        --nanopolish \
        --sequencing_summary $PWD/.github/data/nanopore/20200311_1427_X4_FAK72834_a3787181/sequencing_summary_FAK72834_298b7829.txt \
@@ -17,7 +17,7 @@ cp .nextflow.log artifacts/nanopolish.nextflow.log
 rm -rf results && rm -rf work && rm -rf .nextflow*
 
 echo "Nextflow run current PR in --nanopolish mode (barcodes).." >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile singularity \
        --nanopolish \
        --sequencing_summary $PWD/.github/data/nanopore/20200311_1427_X1_FAK72834_a3787181/sequencing_summary_FAK72834_298b7829.txt \
@@ -28,18 +28,20 @@ cp .nextflow.log artifacts/nanopolish_barcodes.nextflow.log
 rm -rf results && rm -rf work && rm -rf .nextflow*
 
 echo "Nextflow run current PR in --medaka mode (no barcodes).." >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile singularity \
        --medaka \
+       --medaka_model r941_min_fast_g303 \
        --basecalled_fastq $PWD/.github/data/nanopore/20200311_1427_X4_FAK72834_a3787181/fastq_pass/ \
        --prefix 20200311_1427_X4_FAK72834_a3787181
 cp .nextflow.log artifacts/medaka.nextflow.log
 rm -rf results && rm -rf work && rm -rf .nextflow*
 
 echo "Nextflow run current PR in --medaka mode (barcodes).." >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile singularity \
        --medaka \
+       --medaka_model r941_min_fast_g303 \
        --basecalled_fastq $PWD/.github/data/nanopore/20200311_1427_X1_FAK72834_a3787181/fastq_pass/ \
        --prefix 20200311_1427_X1_FAK72834_a3787181
 

diff --git a/.github/scripts/test_sanger_profile.sh b/.github/scripts/test_sanger_profile.sh
@@ -6,7 +6,7 @@ export PATH=/opt/conda/bin:$PATH
 # there are only 2 available cpus in the github runner execution
 sed -i s'/cpus = 4/cpus = 2/'g conf/coguk/sanger.config
 echo run pipeline in --illumina mode with --sanger profile.. >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile sanger,singularity \
        --directory $PWD/.github/data/fastqs/ \
        --illumina \

diff --git a/.github/scripts/test_typing.sh b/.github/scripts/test_typing.sh
@@ -4,12 +4,16 @@ export PATH=/opt/conda/bin:$PATH
 
 # run current pull request code
 singularity --version
+
+# Clone variant_definitions repo
+git clone https://github.com/phe-genomics/variant_definitions.git
+
 # write test log as github Action artifact
 echo "Nextflow run current PR in --nanopolish mode with typing.." >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile singularity \
-       --gff $PWD/typing/MN908947.3.gff \
-       --yaml $PWD/typing/SARS-CoV-2.types.yaml \
+       --gb $PWD/typing/NC_045512.2.gb \
+       --variant_definitions  $PWD/variant_definitions/variant_yaml \
        --nanopolish \
        --sequencing_summary $PWD/.github/data/nanopore/20200311_1427_X4_FAK72834_a3787181/sequencing_summary_FAK72834_298b7829.txt \
        --basecalled_fastq $PWD/.github/data/nanopore/20200311_1427_X4_FAK72834_a3787181/fastq_pass/ \
@@ -19,21 +23,22 @@ cp .nextflow.log artifacts/nanopolish_typing.nextflow.log
 rm -rf results && rm -rf work && rm -rf .nextflow*
 
 echo "Nextflow run current PR in --medaka mode with typing .." >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile singularity \
        --medaka \
-       --gff $PWD/typing/MN908947.3.gff \
-       --yaml $PWD/typing/SARS-CoV-2.types.yaml \
+       --medaka_model r941_min_fast_g303 \
+       --gb $PWD/typing/NC_045512.2.gb \
+       --variant_definitions  $PWD/variant_definitions/variant_yaml \
        --basecalled_fastq $PWD/.github/data/nanopore/20200311_1427_X4_FAK72834_a3787181/fastq_pass/ \
        --prefix 20200311_1427_X4_FAK72834_a3787181
 cp .nextflow.log artifacts/medaka_typing.nextflow.log
 rm -rf results && rm -rf work && rm -rf .nextflow*
 
 echo Nextflow run current PR in --illumina mode with typing.. >> artifacts/test_artifact.log
-NXF_VER=20.03.0-edge nextflow run ./main.nf \
+nextflow run ./main.nf \
        -profile singularity \
-       --gff $PWD/typing/MN908947.3.gff \
-       --yaml $PWD/typing/SARS-CoV-2.types.yaml \
+       --gb $PWD/typing/NC_045512.2.gb \
+       --variant_definitions  $PWD/variant_definitions/variant_yaml \
        --directory $PWD/.github/data/fastqs/ \
        --illumina \
        --prefix test

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
@@ -9,6 +9,7 @@ jobs:
     runs-on: ubuntu-latest
     env:
       NXF_ANSI_LOG: false
+      NXF_VER: 20.10.0
     steps:
     - uses: actions/checkout@master
     - name: create artifacts dir to save test logs
@@ -26,7 +27,7 @@ jobs:
       run: |
        export PATH=/opt/conda/bin:$PATH
        conda install -c bioconda nextflow
-       NXF_VER=20.03.0-edge nextflow -version
+       nextflow -version
     - name: test nanopore pipelines
       run: bash .github/scripts/test_nanopore_pipelines.sh
     - name: test typing functionality

diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,4 @@ nextflow
 results
 *.sif
 work
+variant_definitions
diff --git a/conf/base.config b/conf/base.config
@@ -9,7 +9,8 @@ params{
     sequencing_summary = false
     ref = false
     bed = false
-    gff = false
+    gb = false
+    variant_definitions = false
     profile = false
 
     // Repo to download your primer scheme from

diff --git a/conf/dummyfile b/conf/dummyfile
diff --git a/conf/nanopore.config b/conf/nanopore.config
@@ -24,10 +24,6 @@ params {
     // After articGuppyPlex filter out samples with fewer than this number of reads
     minReadsArticGuppyPlex = 10
 
-    // Typing frequency threshold to call aa consequences of variant.
-    csqAfThreshold = 0.75
-
-    // Minimum coverage depth to call aa consequences of variant.
-    csqDpThreshold = 20
-
+    // Medaka model to use with --medaka
+    medaka_model = false
 }
diff --git a/environments/illumina/environment.yml b/environments/illumina/environment.yml
@@ -8,11 +8,13 @@ dependencies:
   - biopython=1.74
   - libxcb
   - matplotlib=3.3.3
+  - pip
   - pandas=0.23.0=py36_1
   - bwa=0.7.17=pl5.22.0_2
   - samtools=1.10
   - bcftools=1.10
   - trim-galore=0.6.5
   - ivar=1.3
-  - pyvcf=0.6.8
-  - pyyaml=5.3.1
+  - muscle=3.8.1551
+  - pip:
+    - git+https://github.com/connor-lab/aln2type
diff --git a/environments/nanopore/environment.yml b/environments/nanopore/environment.yml
@@ -4,5 +4,7 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - artic=1.1.3
-  - pyyaml=5.3.1
+  - artic=1.2.1
+  - pip
+  - pip:
+    - git+https://github.com/connor-lab/aln2type
diff --git a/main.nf b/main.nf
@@ -51,6 +51,10 @@ if ( params.illumina ) {
    if (! params.basecalled_fastq ) {
        println("Please supply a directory containing basecalled fastqs with --basecalled_fastq. This is the output directory from guppy_barcoder or guppy_basecaller - usually fastq_pass. This can optionally contain barcodeXX directories, which are auto-detected.")
    }
+   if (! params.medaka_model ) {
+       println("Please supply a medaka model with --medaka_model")
+       System.exit(1)
+   }
 } else {
        println("Please select a workflow with --nanopolish, --illumina or --medaka, or use --help to print help")
        System.exit(1)

diff --git a/modules/artic.nf b/modules/artic.nf
@@ -80,6 +80,7 @@ process articMinIONMedaka {
 
     """
     artic minion --medaka \
+    --medaka-model ${params.medaka_model} \
     ${minionFinalConfig} \
     --threads ${task.cpus} \
     --scheme-directory ${schemeRepo} \

diff --git a/modules/help.nf b/modules/help.nf
@@ -28,6 +28,7 @@ def printHelp() {
                               auto-detected and analysed in parallel.
       --fast5_pass            Directory containing fast5 files - usually fast5_pass. NOT REQUIRED FOR MEDAKA WORKFLOW.
       --sequencing_summary    Path to sequencing_summary.txt. NOT REQUIRED FOR MEDAKA WORKFLOW.
+      --medaka_model          Medaka model, e.g. r941_min_fast_g303. NOT REQUIRED FOR NANOPOLISH WORKFLOW.
 
     Optional:
       --outdir                Output directory (Default: ./results)
@@ -43,10 +44,11 @@ def printHelp() {
       --outCram               Output cram instead of bam files (Default: false)
       --minReadsPerBarcode    Minimum number of reads accepted for a single barcode when supplying deplexed Fastq
                               files as input. Barcodes having fewer reads are ignored. (Default: 100)
-
-      --gff                   Path to annotation gff for variant consequence calling and typing. (Default: unset, don't run typing unless set)
-      --yaml                  Path to YAML file with typing schemes.
-                              Format: { <typing_scheme_name> : { coverage: <float>, variants: <gene_name>: <[ D614G, IHV68I ]> }}
+
+      --variant_definitions   Path to variant_definitions directory from https://github.com/phe-genomics/variant_definitions.git.
+                              Must point to the directory that contains *.yml and not its parent.
+      --gb                    Path to GenBank file used to generate AA consequences of mutations, e.g. NC_045512.2.gb (Default: unset)
+
 
 
   Illumina workflow options:
@@ -67,9 +69,11 @@ def printHelp() {
                               Overrides --scheme* options. (Default: unset, download scheme from git)
       --ref                   Path to iVar-compatible reference fasta file, also requires --bed 
                               Overrides --scheme* options. (Default: unset, download scheme from git)
-      --gff                   Path to annotation gff for variant consequence calling and typing. (Default: unset, typing not run unless set)
-      --yaml                  Path to YAML file with typing schemes. 
-                              Format: { <typing_scheme_name> : { coverage: <float>, variants: <gene_name>: <[ D614G, IHV68I ]> }}
+
+      --variant_definitions   Path to variant_definitions directory from https://github.com/phe-genomics/variant_definitions.git.
+                              Must point to the directory that contains *.yml and not its parent.
+      --gb                    Path to GenBank file used to generate AA consequences of mutations, e.g. NC_045512.2.gb (Default: unset)
+
       --allowNoprimer         Allow reads that don't have primer sequence? 
                               Depends on your library prep method: ligation == false, tagmentation == true (Default: true)
       --illuminaKeepLen       Length (bp) of reads to keep after primer trimming (Default: 20)

diff --git a/modules/typing.nf b/modules/typing.nf
@@ -1,47 +1,47 @@
+process alignSeqs {
+    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}/msa", mode: 'copy', overwrite: false, pattern: "${sampleName}.muscle.aln"
+
+    tag { sampleName }
 
-process typeVariants {
+    input:
+      tuple sampleName, path(sample), path(reference)
 
-    tag { sampleName }
+    output:
+      tuple sampleName, path("${sampleName}.muscle.aln")
 
-    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}/variants", pattern: "${sampleName}.variants.csv", mode: 'copy'
-    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}/vcf", pattern: "${sampleName}.csq.vcf", mode: 'copy'
-    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}/typing", pattern: "${sampleName}.typing.csv", mode: 'copy'
+    script:
+      """
+      sed "s/>.*/>${sampleName}/g" $sample > ${sampleName}.clean.fa
+      cat $reference ${sampleName}.clean.fa > pre.aln
+      muscle -in pre.aln -out ${sampleName}.muscle.aln
+      """
+}
+
+process typeVariants {
+    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}/typing_json", mode: 'copy', overwrite: true, pattern: "${sampleName}.json.gz"
+    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}/variant_csv", mode: 'copy', overwrite: true, pattern: "${sampleName}.csv"
+
+    tag { sampleName }
 
     input:
-    tuple sampleName, path(variants), path(gff), path(ref), path(yaml)
+      tuple sampleName, refName, path(msa), path(yaml_dir), path(gb)
 
     output:
-    path "${sampleName}.variants.csv", optional: true, emit: variants_csv
-    path "${sampleName}.typing.csv", optional: true, emit: typing_csv
-    path "${sampleName}.csq.vcf", emit: csq_vcf
+      path("aln2type.${sampleName}.csv"), emit: typing_csv optional true
+      path("${sampleName}.csv"), emit: variants_csv optional true
+      path("${sampleName}.json.gz") optional true
 
     script:
-    if( params.illumina )
+      if ( gb.getBaseName() != 'dummyfile' ){
         """
-        type_vcf.py \
-        -i ${sampleName} \
-        -y ${yaml} \
-        -ov ${sampleName}.csq.vcf \
-        -ot ${sampleName}.typing.csv \
-        -os ${sampleName}.variants.csv \
-        -dp ${params.csqDpThreshold} \
-        -af ${params.csqAfThreshold} \
-        -t ${variants} \
-        ${gff} ${ref}
+        aln2type --gb ${gb} --output_unclassified . . aln2type.${sampleName}.csv ${refName} ${msa} ${yaml_dir}/*.yml
         """
-    else
+      } else {
         """
-        type_vcf.py \
-        -i ${sampleName} \
-        -y ${yaml} \
-        -ov ${sampleName}.csq.vcf \
-        -ot ${sampleName}.typing.csv \
-        -os ${sampleName}.variants.csv \
-        -dp ${params.csqDpThreshold} \
-        -af ${params.csqAfThreshold} \
-        -v ${variants} \
-        ${gff} ${ref}
+        aln2type --output_unclassified . . aln2type.${sampleName}.csv ${refName} ${msa} ${yaml_dir}/*.yml
         """
+      }
+
 }
 
 process mergeTypingCSVs {

diff --git a/nextflow.config b/nextflow.config
@@ -95,7 +95,7 @@ manifest {
   author = 'Matt Bull'
   description = 'Nextflow for running the Artic ncov2019 pipeline'
   mainScript = 'main.nf'
-  nextflowVersion = '>=20.01.0'
+  nextflowVersion = '!>=20.10.0'
   version = '0.1.0'
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,3 +3,4 @@ nextflow @@
     results
     *.sif
     work
+    variant_definitions