From 7072d598ca2b600aa9603bb14bc3228b2e17e531 Mon Sep 17 00:00:00 2001 From: adamrtalbot <12817534+adamrtalbot@users.noreply.github.com> Date: Sat, 2 Nov 2024 12:26:01 +0000 Subject: [PATCH] Adds fq/lint for early validation of FASTQs Validating FASTQs early prevents running the pipeline on invalid FASTQ files, which will make the pipeline more efficient at achieving its ultimate objective of checking FASTQ validity. It adds 3 more parameters: - `--skip_linting` which disables the linting of FASTQs - `--fq_lint_args` which is a string of arguments to pass to the linting tool - `--continue_with_lint_fail` which is a boolean to determine whether to continue if the linting fails Between these three options, the user has a high degree of control over how the pipeline lints, which should handle most use cases. Closes #31 --- CHANGELOG.md | 1 + CITATIONS.md | 2 + README.md | 5 +- conf/modules.config | 9 ++ modules.json | 49 ++++++++--- modules/nf-core/fq/lint/environment.yml | 5 ++ modules/nf-core/fq/lint/main.nf | 33 +++++++ modules/nf-core/fq/lint/meta.yml | 43 +++++++++ modules/nf-core/fq/lint/tests/main.nf.test | 63 ++++++++++++++ .../nf-core/fq/lint/tests/main.nf.test.snap | 25 ++++++ modules/nf-core/fq/lint/tests/tags.yml | 2 + nextflow.config | 7 ++ nextflow_schema.json | 28 +++++- tests/rnaseq.main.nf.test | 87 +++++++++++++++++++ tests/rnaseq.main.nf.test.config | 8 ++ workflows/seqinspector.nf | 19 ++++ 16 files changed, 372 insertions(+), 14 deletions(-) create mode 100644 modules/nf-core/fq/lint/environment.yml create mode 100644 modules/nf-core/fq/lint/main.nf create mode 100644 modules/nf-core/fq/lint/meta.yml create mode 100644 modules/nf-core/fq/lint/tests/main.nf.test create mode 100644 modules/nf-core/fq/lint/tests/main.nf.test.snap create mode 100644 modules/nf-core/fq/lint/tests/tags.yml create mode 100644 tests/rnaseq.main.nf.test create mode 100644 tests/rnaseq.main.nf.test.config diff --git a/CHANGELOG.md b/CHANGELOG.md index 91ee0b7..ea20eb3 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Initial release of nf-core/seqinspector, created with the [nf-core](https://nf-c - [#51](https://github.com/nf-core/seqinspector/pull/51) Add nf-test to CI. - [#63](https://github.com/nf-core/seqinspector/pull/63) Contribution guidelines added about displaying results for new tools - [#53](https://github.com/nf-core/seqinspector/pull/53) Add FastQ-Screen database multiplexing and limit scope of nf-test in CI. +- [#67](https://github.com/nf-core/seqinspector/pull/67) Add FASTQ linting for early validation ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index 208cfa1..e1811c7 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,8 @@ ## Pipeline tools +- [FQ](https://github.com/stjude-rust-labs/fq) + - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. diff --git a/README.md b/README.md index 6cf36dc..6f25df4 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,10 @@ workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples. --> +1. Lint FASTQs with ([`fq`](https://github.com/stjude-rust-labs/fq)) 1. Subsample reads ([`Seqtk`](https://github.com/lh3/seqtk)) -2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -3. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +1. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Usage diff --git a/conf/modules.config b/conf/modules.config index 4a653ed..4e4f33f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,6 +18,15 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + withName: 'FQ_LINT' { + ext.args = { params.fq_lint_args } + errorStrategy = { + task.exitStatus in ((130..145) + 104) ? 
'retry' : + params.continue_with_lint_fail ? 'ignore' : + 'finish' + } + } + withName: SEQTK_SAMPLE { ext.args = '-s100' } diff --git a/modules.json b/modules.json index 01ae911..00a9f11 100644 --- a/modules.json +++ b/modules.json @@ -8,38 +8,59 @@ "bowtie2/build": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", "git_sha": "08108058ea36a63f141c25c4e75f9f872a5b2296", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqscreen/buildfromindex": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqscreen/fastqscreen": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/fastqscreen/fastqscreen/fastqscreen-fastqscreen.diff" }, + "fq/lint": { + "branch": "master", + "git_sha": "a1abf90966a2a4016d3c3e41e228bfcbd4811ccc", + "installed_by": [ + "modules" + ] + }, "multiqc": { "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "seqfu/stats": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "seqtk/sample": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -48,20 +69,26 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] 
}, "utils_nfschema_plugin": { "branch": "master", "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/fq/lint/environment.yml b/modules/nf-core/fq/lint/environment.yml new file mode 100644 index 0000000..74b1460 --- /dev/null +++ b/modules/nf-core/fq/lint/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fq=0.12.0 diff --git a/modules/nf-core/fq/lint/main.nf b/modules/nf-core/fq/lint/main.nf new file mode 100644 index 0000000..943314c --- /dev/null +++ b/modules/nf-core/fq/lint/main.nf @@ -0,0 +1,33 @@ +process FQ_LINT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fq:0.12.0--h9ee0642_0': + 'biocontainers/fq:0.12.0--h9ee0642_0' }" + + input: + tuple val(meta), path(fastq) + + output: + tuple val(meta), path("*.fq_lint.txt"), emit: lint + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + fq lint \\ + $args \\ + $fastq > ${prefix}.fq_lint.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fq: \$(echo \$(fq lint --version | sed 's/fq-lint //g')) + END_VERSIONS + """ +} diff --git a/modules/nf-core/fq/lint/meta.yml b/modules/nf-core/fq/lint/meta.yml new file mode 100644 index 0000000..7240fb5 --- /dev/null +++ b/modules/nf-core/fq/lint/meta.yml @@ -0,0 +1,43 @@ +name: "fq_lint" +description: fq lint is a FASTQ file pair validator. +keywords: + - lint + - fastq + - validate +tools: + - "fq": + description: "fq is a library to generate and validate FASTQ file pairs." 
+ homepage: "https://github.com/stjude-rust-labs/fq" + documentation: "https://github.com/stjude-rust-labs/fq" + tool_dev_url: "https://github.com/stjude-rust-labs/fq" + licence: ["MIT"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastq: + type: file + description: FASTQ file list + pattern: "*.fastq{,.gz}" +output: + - lint: + - meta: + type: file + description: Lint output + pattern: "*.fq_lint.txt" + - "*.fq_lint.txt": + type: file + description: Lint output + pattern: "*.fq_lint.txt" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@adamrtalbot" +maintainers: + - "@adamrtalbot" diff --git a/modules/nf-core/fq/lint/tests/main.nf.test b/modules/nf-core/fq/lint/tests/main.nf.test new file mode 100644 index 0000000..ec2eaf8 --- /dev/null +++ b/modules/nf-core/fq/lint/tests/main.nf.test @@ -0,0 +1,63 @@ +nextflow_process { + + name "Test Process FQ_LINT" + script "../main.nf" + process "FQ_LINT" + + tag "modules" + tag "modules_nfcore" + tag "fq" + tag "fq/lint" + + test("test_fq_lint_success") { + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert process.out.lint.get(0).get(1) ==~ ".*/test.fq_lint.txt" }, + { assert path(process.out.lint.get(0).get(1)).getText().contains("fq-lint start") }, + { assert path(process.out.lint.get(0).get(1)).getText().contains("read 100 records") }, + { assert path(process.out.lint.get(0).get(1)).getText().contains("fq-lint end") }, + ) + } + + } + + 
test("test_fq_lint_fail") { + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert !process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + +} diff --git a/modules/nf-core/fq/lint/tests/main.nf.test.snap b/modules/nf-core/fq/lint/tests/main.nf.test.snap new file mode 100644 index 0000000..fec8e52 --- /dev/null +++ b/modules/nf-core/fq/lint/tests/main.nf.test.snap @@ -0,0 +1,25 @@ +{ + "test_fq_lint_fail": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "lint": [ + + ], + "versions": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-19T16:37:02.133847389" + } +} \ No newline at end of file diff --git a/modules/nf-core/fq/lint/tests/tags.yml b/modules/nf-core/fq/lint/tests/tags.yml new file mode 100644 index 0000000..9c9c323 --- /dev/null +++ b/modules/nf-core/fq/lint/tests/tags.yml @@ -0,0 +1,2 @@ +fq/lint: + - modules/nf-core/fq/lint/** diff --git a/nextflow.config b/nextflow.config index 9ab822e..a9b27a6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,13 @@ params { // Input options input = null sample_size = 0 + + // Options + skip_linting = false + fq_lint_args = "" + continue_with_lint_fail = false + + // References genome = null fasta = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 3f84164..77af3b5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -31,7 +31,6 @@ }, "outdir": { "type": "string", - "default": null, "format": "directory-path", "description": "The output directory where the results will be saved. 
You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" @@ -50,6 +49,30 @@ } } }, + "validation_options": { + "title": "Validation options", + "type": "object", + "description": "Options for validating and screening FASTQ files.", + "default": "", + "properties": { + "skip_linting": { + "type": "boolean", + "default": false, + "description": "Skip linting the FASTQs before performing QC on the sequences", + "help_text": "By default, FASTQ files will be linted with FQ early in the pipeline. If they fail validation, the pipeline will terminate, preventing expensive quality control steps being performed on the other samples. If `--continue_with_lint_fail` is enabled, quality control will be performed on the remaining samples. Set this parameter to skip linting entirely." + }, + "fq_lint_args": { + "type": "string", + "description": "Arguments to pass to FQ lint", + "help_text": "Arguments to pass to FQ lint. This can be used to disable overly strict linting. See https://github.com/stjude-rust-labs/fq?tab=readme-ov-file#lint for more information." + }, + "continue_with_lint_fail": { + "type": "boolean", + "description": "Whether to continue with the pipeline if linting fails for a single sample.", + "help_text": "If set to true, the pipeline will continue with the remaining samples if linting fails for a single sample. If set to false, the pipeline will terminate if linting fails for a single sample." 
+ } + } + }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -245,6 +268,9 @@ { "$ref": "#/$defs/input_output_options" }, + { + "$ref": "#/$defs/validation_options" + }, { "$ref": "#/$defs/reference_genome_options" }, diff --git a/tests/rnaseq.main.nf.test b/tests/rnaseq.main.nf.test new file mode 100644 index 0000000..411e58c --- /dev/null +++ b/tests/rnaseq.main.nf.test @@ -0,0 +1,87 @@ +nextflow_pipeline { + + name "Test Workflow main.nf on NovaSeq6000 data" + script "../main.nf" + tag "seqinspector" + tag "PIPELINE" + + test("rnaseq data test fail linting") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + } + } + + then { + assertAll( + // Linting should fail! + { assert workflow.failed } + ) + } + } + + test("rnaseq data test skip linting") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + skip_linting = true + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } + + test("rnaseq data test ignore linting") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + continue_with_lint_fail = true + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_general_stats.txt") + ) + }, + ) + } + } + + test("rnaseq data test add args to fq/lint") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + fq_lint_args = "--disable-validator P001" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_fastqc.txt"), + 
path("$outputDir/multiqc/global_report/multiqc_data/multiqc_general_stats.txt") + ) + }, + ) + } + } +} diff --git a/tests/rnaseq.main.nf.test.config b/tests/rnaseq.main.nf.test.config new file mode 100644 index 0000000..f85acb5 --- /dev/null +++ b/tests/rnaseq.main.nf.test.config @@ -0,0 +1,8 @@ +// Load the basic test config +includeConfig 'nextflow.config' + +// Load the correct samplesheet for that test +params { + input = params.pipelines_testdata_base_path + '626c8fab639062eade4b10747e919341cbf9b41a/samplesheet/v3.10/samplesheet_test.csv' + +} diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index edce542..f877516 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -7,6 +7,7 @@ include { samplesheetToList } from 'plugin/nf-schema' */ include { SEQTK_SAMPLE } from '../modules/nf-core/seqtk/sample/main' +include { FQ_LINT } from '../modules/nf-core/fq/lint/main' include { FASTQC } from '../modules/nf-core/fastqc/main' include { SEQFU_STATS } from '../modules/nf-core/seqfu/stats' include { FASTQSCREEN_FASTQSCREEN } from '../modules/nf-core/fastqscreen/fastqscreen/main' @@ -37,6 +38,24 @@ workflow SEQINSPECTOR { ch_multiqc_extra_files = Channel.empty() ch_multiqc_reports = Channel.empty() + // + // MODULE: Run FQ_LINT to catch early errors + // + if ( !params.skip_linting ) { + FQ_LINT ( + ch_samplesheet + ) + ch_versions = ch_versions.mix(FQ_LINT.out.versions.first()) + // This catches all FASTQs that pass linting + // If you use an error strategy that allows FQ_LINT to fail, + // only valid FASTQ files will be passed to the next module + ch_samplesheet = FQ_LINT.out.lint + .join(ch_samplesheet) + .map { meta, fq_lint, reads -> + [meta, reads] + } + } + // // MODULE: Run Seqtk sample to perform subsampling //