From bdfc557a64cecc19d1d86eead8bdb691a1ff2166 Mon Sep 17 00:00:00 2001 From: Max Date: Tue, 5 Nov 2024 14:55:56 +0100 Subject: [PATCH] feat!: latest development for new release (#133) * chore: Update development (#128) * docs: enhancing documentation * docs: better quickstart * chore: ubdate github actions to setup-micromamba * docs: remove default channel from environment file * docs: improvements, like QC report (#125) * added .DS_Store to gitignore. * Fixed the overflow of the features section by using the table. * Fixed the broked report link. * Sample QC report HTML file * Added the link to the QC report in experiment. * Added the assignment QC report. * Add link to QC report in assignment documentation * Update documentation in quickstart.rst. Fixed typos and gramatical mistakes. * Update documentation in index.rst. Fix typos and grammatical mistakes. * Fix typo in installation documentation * Refactor documentation in config.rst --------- Co-authored-by: Max * docs: Fixed the link for the QC report in Experiment and Assignment (#126) * added .DS_Store to gitignore. * Fixed the overflow of the features section by using the table. * Fixed the broked report link. * fixed typo project * Typo fix controlled * Sample QC report HTML file * Added the link to the QC report in experiment. * Added the assignment QC report. * Add link to QC report in assignment documentation * Update documentation in quickstart.rst. Fixed typos and gramatical mistakes. * Update documentation in index.rst. Fix typos and grammatical mistakes. * Fix typo in installation documentation * Refactor documentation in config.rst * Update documentation links in assignment.rst and experiment.rst * Testing the iframe html file. * Update documentation links in assignment.rst and experiment.rst --------- Co-authored-by: Max * chore: delete not necessary files * docs: automatic versioning * style: automatic version printing of MPRAsnakeflow * fix: memory resources for bbmap (#123) * fix: add memory resources for bbmap * set lower memm in bbmap workflow profile * increasing memory for bmap --------- Co-authored-by: Max Schubach Co-authored-by: Max Schubach * fix: Detach from anaconda (#122) * fix: detach from anaconda. Remove defaults conda channels * fixing linting errors * update hashes in dockerfile from lining errors --------- Co-authored-by: Max Schubach * chore(master): release MPRAsnakeflow 0.1.1 (#124) * chore(master): release MPRAsnakeflow 0.1.1 * Update .release-please-manifest.json * Update version.txt * Update CHANGELOG.md --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Max * forgot to upgrade two envs * docs: correct link in docs badge --------- Co-authored-by: Max Schubach Co-authored-by: Ali <69039717+bioinformaticsguy@users.noreply.github.com> Co-authored-by: Max Schubach Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * feat!: igvf outputs (#129) * refactor: removed statistics from final barcode to oligo map * refactor outputs * fix scripts due to renaming headers * fix assignment statistic due to new output * refactor!: moving files. not attched counts are not used as well as median for scaling * adding logs --------- Co-authored-by: Max Schubach * chore!: supporting only snakemake >=8.24.1 (#130) Co-authored-by: Max Schubach * refactor!: No min max length for bbmap. default mapq is 30. (#131) Changes for bbmap * no min an max for sequence length and start. 
(like exact matching) * using default of 30 mapq instead of 35 * feat!: outlier removal (#132) * feat!: outlier detection Might break older config files * docs: update documentation for bbmap, apptainer and outlier removal * use abs for zscore * trying to fix outlier via zscore * mad code change * change outlier removal default to zscore --------- Co-authored-by: Max Schubach * edit config --------- Co-authored-by: Max Schubach Co-authored-by: Ali <69039717+bioinformaticsguy@users.noreply.github.com> Co-authored-by: Max Schubach Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .gitignore | 5 +- README.md | 13 +- config/example_assignment_bbmap.yaml | 10 +- config/example_config.yaml | 4 +- docs/assignment.rst | 2 +- docs/cluster.rst | 6 +- docs/config.rst | 34 ++-- docs/experiment.rst | 2 +- docs/index.rst | 12 +- docs/install.rst | 10 +- resources/assoc_basic/config.yml | 8 +- resources/combined_basic/config.yml | 8 +- resources/count_basic/config.yml | 8 + workflow/Snakefile | 20 +- workflow/rules/assigned_counts.smk | 96 ++++++++- workflow/rules/assignment.smk | 18 +- workflow/rules/assignment/statistic.smk | 2 +- workflow/rules/common.smk | 39 +++- workflow/rules/statistic/assigned_counts.smk | 4 +- workflow/rules/statistic/bc_overlap.smk | 12 +- workflow/rules/statistic/correlation.smk | 20 +- workflow/rules/variants.smk | 4 +- workflow/schemas/config.schema.yaml | 79 +++---- .../scripts/assignment/statistic_assignment.R | 82 ++++---- workflow/scripts/count/combine_replicates.py | 22 +- workflow/scripts/count/make_master_tables.R | 156 ++++++++------ workflow/scripts/count/merge_label.py | 192 ++++++++++++++---- .../count/merge_replicates_barcode_counts.py | 131 ++++++------ .../count/plot_perInsertCounts_correlation.R | 14 +- .../count/plot_perInsertCounts_stats.R | 19 +- .../variants/correlateVariantTables.py | 2 +- .../variants/generateMasterVariantTable.py | 28 +-- .../scripts/variants/generateVariantTable.py | 8 +- 33 files changed, 671 insertions(+), 399 deletions(-) diff --git a/.gitignore b/.gitignore index 38d5e812..9267b83a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,9 @@ logs !config/* !resources !resources/** +resources/**/.local +resources/**/.cache +resources/**/.ipython !workflow !workflow/** !.gitattributes @@ -27,4 +30,4 @@ mix_data *report.html *.simg *results -.DS_Store \ No newline at end of file +.DS_Store diff --git a/README.md b/README.md index 2268cf72..bb7fd326 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Snakemake workflow: MPRAsnakeflow [![Documentation Status](https://readthedocs.org/projects/mprasnakeflow/badge/?version=latest)](https://mprasnakeflow.readthedocs.io/latest/?badge=latest) -[![Snakemake](https://img.shields.io/badge/snakemake-≥7.2.1-brightgreen.svg)](https://snakemake.bitbucket.io) +[![Snakemake](https://img.shields.io/badge/snakemake-≥8.24.1-brightgreen.svg)](https://snakemake.github.io/) [![Tests](https://github.com/kircherlab/MPRAsnakeflow/actions/workflows/main.yml/badge.svg)](https://github.com/kircherlab/MPRAsnakeflow/actions/workflows/main.yml) This pipeline processes sequencing data from Massively Parallel Reporter Assays (MPRA) to create count tables for candidate sequences tested in the experiment. 
@@ -33,9 +33,9 @@ Create or adjust the `config/example_config.yaml` in the repository to your need ### Step 3: Install Snakemake -Install Snakemake (recommended version >= 8.x) using [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) or [mamba](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html) (recommended installation via [miniforge](https://github.com/conda-forge/miniforge)): +Install Snakemake (version >= 8.24.1) using [conda >24.7.1](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) (recommended installation via [miniforge](https://github.com/conda-forge/miniforge)): - mamba create -c bioconda -n snakemake snakemake + conda create -c bioconda -n snakemake snakemake For installation details, see the [instructions in the Snakemake documentation](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html). @@ -43,7 +43,7 @@ For installation details, see the [instructions in the Snakemake documentation]( Activate the conda environment: - mamba activate snakemake + conda activate snakemake Test your configuration by performing a dry-run via @@ -58,9 +58,6 @@ using `$N` cores or run it in a cluster environment (here SLURM) via the [slurm snakemake --software-deployment-method conda --executor slurm --cores $N --configfile config.yaml --workflow-profile profiles/default Please note that `profiles/default/config.yaml` has to be adapted to your needs (like partition names). -For snakemake 7.x this might work too using slurm sbatch (but depricated in newer snakemake versions: - - snakemake --use-conda --configfile config.yaml --cluster "sbatch --nodes=1 --ntasks={cluster.threads} --mem={cluster.mem} -t {cluster.time} -p {cluster.queue} -o {cluster.output}" --jobs 100 --cluster-config config/sbatch.yaml Please note that the log folder of the cluster environment has to be generated first, e.g: @@ -71,7 +68,7 @@ For other cluster environments please check the [Snakemake](https://snakemake.re If you not only want to fix the software stack but also the underlying OS, use - snakemake --sdm apptainer,conda --cores $N --configfile config.yaml --workflow-profile profiles/default + snakemake --sdm apptainer conda --cores $N --configfile config.yaml --workflow-profile profiles/default in combination with any of the modes above. This will use a pre-build singularity container of MPRAsnakeflow with the conda ens installed in. diff --git a/config/example_assignment_bbmap.yaml b/config/example_assignment_bbmap.yaml index aa48288f..04596867 100644 --- a/config/example_assignment_bbmap.yaml +++ b/config/example_assignment_bbmap.yaml @@ -8,13 +8,9 @@ assignments: alignment_tool: tool: bbmap configs: - min_mapping_quality: 35 # integer >=0. 35 is default - sequence_length: # sequence length of design excluding adapters. - min: 166 - max: 175 - alignment_start: # start of an alignment in the reference/design_file. Here using 15 bp adapters. Can be different when using adapter free approaches - min: 1 # integer - max: 3 # integer + min_mapping_quality: 30 # 30 is default for bbmap + sequence_length: 171 # sequence length of design excluding adapters. + alignment_start: 1 # start of an alignment in the reference/design_file. Here using 15 bp adapters. 
Can be different when using adapter free approaches FW: - resources/Assignment_BasiC/R1.fastq.gz BC: diff --git a/config/example_config.yaml b/config/example_config.yaml index 3da3db5f..85a45d22 100644 --- a/config/example_config.yaml +++ b/config/example_config.yaml @@ -6,9 +6,9 @@ assignments: exampleAssignment: # name of an example assignment (can be any string) bc_length: 15 alignment_tool: - tool: exact # bbbmap, bwa or exact + tool: exact # bbmap, bwa or exact configs: - sequence_length: 170 # sequence length of design excluding adapters. + sequence_length: 171 # sequence length of design excluding adapters. alignment_start: 1 # start of the alignment in the reference/design_file FW: - resources/assoc_basic/data/SRR10800986_1.fastq.gz diff --git a/docs/assignment.rst b/docs/assignment.rst index 3c57526a..1fb3ad42 100644 --- a/docs/assignment.rst +++ b/docs/assignment.rst @@ -73,7 +73,7 @@ Mandatory arguments: :\-\-configfile: Specify or overwrite the config file of the workflow (see the docs). Values specified in JSON or YAML format are available in the global config dictionary inside the workflow. Multiple files overwrite each other in the given order. Thereby missing keys in previous config files are extended by following configfiles. Note that this order also includes a config file defined in the workflow definition itself (which will come first). (default: None) :\-\-sdm: - **Required to run MPRAsnakeflow.** : :code:`--sdm conda` or :code:`--sdm apptainer` Uses the defined conda environment per rule. We highly recommend to use apptainer where we build a predefined docker container with all software installewd within it. :code:`--sdm conda` teh conda envs will be installed by the first excecution of the workflow. If this flag is not set, the conda/apptainer directive is ignored. (default: False) + **Required to run MPRAsnakeflow.** : :code:`--sdm conda` or :code:`--sdm apptainer conda` Uses the defined conda environment per rule. We highly recommend to use apptainer where we build a predefined docker container with all software installewd within it. :code:`--sdm conda` teh conda envs will be installed by the first excecution of the workflow. If this flag is not set, the conda/apptainer directive is ignored. (default: False) Recommended arguments: :\-\-snakefile: You should not need to specify this. By default, Snakemake will search for 'Snakefile', 'snakefile', 'workflow/Snakefile','workflow/snakefile' beneath the current working directory, in this order. Only if you definitely want a different layout, you need to use this parameter. This is very usefull when you want to have the results in a different folder than MPRAsnakeflow is in. (default: None) diff --git a/docs/cluster.rst b/docs/cluster.rst index 0b8028a7..64c42880 100644 --- a/docs/cluster.rst +++ b/docs/cluster.rst @@ -32,10 +32,10 @@ Using the slurm excecutor plugin running 300 jobs in parallel. snakemake --sdm conda --configfile config/config.yaml -j 300 --workflow-profile profiles/default --executor slurm -Snakemake 7 ------------ +Snakemake 7 (not supported anymore) +------------------------------------- -Here we used the :code:`--cluster` option which is not anymo,onger available in snakemake 8. You can also use the predefined `config/sbatch.yaml` but this might be outdated and we highly recommend to use resources with the workfloe profile. +Here we used the :code:`--cluster` option which is not available in snakemake 8. 
You can also use the predefined `config/sbatch.yaml` but this might be outdated and we highly recommend to use resources with the workfloe profile. .. code-block:: bash diff --git a/docs/config.rst b/docs/config.rst index d5c72033..fbf5d133 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -47,7 +47,7 @@ For each assignment you want to process you have to give him a name like :code:` Alignment tool configuration that is used to map the reads to the oligos. :tool: - Alignment tool that is used. Currently :code:`bwa` and :code:`exact` are supported. + Alignment tool that is used. Currently :code:`bbmap` :code:`bwa`, :code:`exact` are supported. Default is :code:`bbmap`. :configs: Configurations of the alignment tool selected. @@ -55,11 +55,11 @@ For each assignment you want to process you have to give him a name like :code:` Defines the :code:`min` and :code:`max` of a :code:`sequence_length` specify. :code:`sequence_length` is basically the length of a sequence alignment to an oligo in the design file. Because there can be insertion and deletions we recommend to vary it a bit around the exact length (e.g. +-5). In theory, this option enables designs with multiple sequence lengths. :alignment_start (bwa): Defines the :code:`min` and :code:`max` of the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise, 1 will be the choice for most cases. We also recommend varying this value a bit because the start might not be exact after the adapter. E.g. by +-1. - :min_mapping_quality (bwa): - (Optional) Defines the minimum mapping quality (MAPQ) of the alignment to an oligo. When using oligos with only 1bp difference it is recommended to set it to 1. For regions only with larger edit distances 30 or 40 might be a good choice. Default :code:`1`. - :sequence_length (exact): + :min_mapping_quality (bwa, bbmap): + (Optional) Defines the minimum mapping quality (MAPQ) of the alignment to an oligo. MAPQs are different between bbmap and bwa. For bwa: When using oligos with only 1bp difference it is recommended to set it to 1. BBMap is better here and we can use for example 30 or 35- For regions only with larger edit distances 30 or 40 might be a good choice. Default :code:`30` (use bbmap). + :sequence_length (exact, bbmap): Defines the :code:`sequence_length` which is the length of a sequence alignment to an oligo in the design file. Only one length design is supported. - :alignment_start (exact): + :alignment_start (exact, bbmap): Defines the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise, 1 will be the choice for most cases. :bc_length: @@ -168,16 +168,22 @@ The experiment workflow is configured in the :code:`experiments` section. Each e :bc_threshold: Minimum number of different BCs required per oligo. A higher value normally increases the correlation betwene replicates but also reduces the number of final oligos. Default option is :code:`10`. - :DNA: - Settings for DNA - - :min_counts: - Mimimum number of DNA counts per barcode. When set to :code:`0` a pseudo count is added. Default option is :code:`1`. - :RNA: - Settings for DNA + :min_dna_counts: + Mimimum number of DNA counts per barcode. When set to :code:`0` a pseudo count is added. Default option is :code:`1`. + :min_rna_counts: + Mimimum number of RNA counts per barcode. When set to :code:`0` a pseudo count is added. Default option is :code:`1`. + :outlier_detection: + (Optional) Outlier detection. 
Methods and strategies to remove outlier barcodes in the final counts. The following options are possible: + + :method: + Method to remove outliers. Currently :code:`rna_counts_zscore`, :code:`ratio_mad` or :code:`none` (no outlier detection) are supported. Default option is :code:`rna_counts_zscore`. + :mad_bins: + (Optional) For method :code:`ratio_mad`: Number of bins for the median absolute deviation (MAD) method. Default option is :code:`20`. + :times_mad: + (Optional) For method :code:`ratio_mad`: Times the MAD to remove outliers. Default option is :code:`5`. + :times_zscore: + (Optional) For method :code:`rna_counts_zscore`: Times the zscore to remove outliers. Default option is :code:`3`. - :min_counts: - Mimimum number of RNA counts per barcode. When set to :code:`0` a pseudo count is added. Default option is :code:`1`. :sampling: (Optional) Options for sampling counts and barcodes. Just for debug reasons. diff --git a/docs/experiment.rst b/docs/experiment.rst index 70ba2cf9..672310d2 100644 --- a/docs/experiment.rst +++ b/docs/experiment.rst @@ -88,7 +88,7 @@ Mandatory arguments: :\-\-configfile: Specify or overwrite the config file of the workflow (see the docs). Values specified in JSON or YAML format are available in the global config dictionary inside the workflow. Multiple files overwrite each other in the given order. Thereby missing keys in previous config files are extended by following configfiles. Note that this order also includes a config file defined in the workflow definition itself (which will come first). (default: None) :\-\-sdm: - **Required to run MPRAsnakeflow.** : :code:`--sdm conda` or :code:`--sdm apptainer` Uses the defined conda environment per rule. We highly recommend to use apptainer where we build a predefined docker container with all software installewd within it. :code:`--sdm conda` teh conda envs will be installed by the first excecution of the workflow. If this flag is not set, the conda/apptainer directive is ignored. (default: False) + **Required to run MPRAsnakeflow.** : :code:`--sdm conda` or :code:`--sdm apptainer conda` Uses the defined conda environment per rule. We highly recommend to use apptainer where we build a predefined docker container with all software installewd within it. :code:`--sdm conda` the conda envs will be installed by the first excecution of the workflow. If this flag is not set, the conda/apptainer directive is ignored. (default: False) Recommended arguments: :\-\-snakefile: You should not need to specify this. By default, Snakemake will search for 'Snakefile', 'snakefile', 'workflow/Snakefile','workflow/snakefile' beneath the current working directory, in this order. Only if you definitely want a different layout, you need to use this parameter. This is very usefull when you want to have the results in a different folder than MPRAsnakeflow is in. (default: None) diff --git a/docs/index.rst b/docs/index.rst index 3c5e3f49..b19ce56d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,11 +4,11 @@ MPRAsnakeflow's documentation ==================================== -.. image:: https://img.shields.io/badge/snakemake-≥7.7.1-brightgreen.svg - :target: https://snakemake.bitbucket.io +.. image:: https://img.shields.io/badge/snakemake-≥8.24.1-brightgreen.svg + :target: https://snakemake.github.io/ -.. image:: https://img.shields.io/badge/mamba-≥4.6-brightgreen.svg - :target: https://docs.conda.io/en/latest/miniconda.html +.. 
image:: https://img.shields.io/badge/conda->24.7.1-brightgreen.svg + :target: https://github.com/conda-forge/miniforge **Welcome!** @@ -16,7 +16,7 @@ MPRAsnakeflow's documentation MPRAsnakeflow pipeline processes sequencing data from Massively Parallel Reporter Assays (MPRAs) to create count tables for candidate sequences tested in the experiment. -MPRAsnakeflow is built on top of `Snakemake `_ (version 8 preferred) and is configured via a ``.yaml`` file. +MPRAsnakeflow is built on top of `Snakemake `_ (version ≥8.24.1 required) and is configured via a ``.yaml`` file. Authors Max Schubach (`@visze `_) @@ -74,7 +74,7 @@ Features * - Option - Description * - ``--software-deployment-method`` - - When ``conda`` is set, the utility uses mamba to efficiently query repositories and query package dependencies. MPRAsnakeflow also can use containers via apptainer by using ``--software-deployment-method apptainer``. Recommended option: ``--software-deployment-method conda apptainer`` + - When ``conda`` is set, the utility uses conda to efficiently query repositories and query package dependencies. MPRAsnakeflow also can use containers via apptainer by using ``--software-deployment-method apptainer conda``. This will use a container to run all rules but inside it will activate the pre-installed conda environments. Recommended option: ``--software-deployment-method apptainer conda`` * - ``--cores`` - This utility sets the number of cores (``$N``) to be used by MPRAsnakeflow. * - ``--configfile`` diff --git a/docs/install.rst b/docs/install.rst index df7dad53..79d2c29e 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -19,7 +19,7 @@ Package management .. code-block:: bash - conda (mamba) 4.6 or above + conda >24.7.1 or above Download here: https://github.com/conda-forge/miniforge @@ -36,7 +36,7 @@ Workflow language .. code-block:: bash - snakemake 8.16.0 or above (snakemake >=7.15.1 will also work but cli might be different as here documented) + snakemake 8.24.1 or above Download here: https://snakemake.readthedocs.io/ @@ -47,17 +47,17 @@ Clone repository Download here: https://github.com/kircherlab/MPRAsnakeflow.git -Set up snakemake environment with conda/mamba +Set up snakemake environment with conda ============================================= -This pipeline uses python2.7 and python3.6 with additional R scripts in a Snakemake pipeline. The ``.yml`` files provided will create the appropriate environments and is completely handled by MPRAsnakeflow. The whole pipeline is set up to run on a Linux system. +This pipeline uses python2.7 and python ≥3.7 with additional R scripts in a Snakemake pipeline. The ``.yml`` files provided will create the appropriate environments and is completely handled by MPRAsnakeflow. The whole pipeline is set up to run on a Linux system. Install the the conda environment. The general conda environment is called ``snakemake``. .. 
code-block:: bash cd MPRAsnakeflow - mamba create -c conda-forge -c bioconda -n snakemake snakemake + conda create -c conda-forge -c bioconda -n snakemake snakemake # activate snakemake conda activate snakemake diff --git a/resources/assoc_basic/config.yml b/resources/assoc_basic/config.yml index 9746bc53..10a1020e 100644 --- a/resources/assoc_basic/config.yml +++ b/resources/assoc_basic/config.yml @@ -8,12 +8,8 @@ assignments: alignment_tool: tool: bbmap configs: - sequence_length: - min: 166 - max: 175 - alignment_start: - min: 1 - max: 3 + sequence_length: 171 + alignment_start: 1 FW: - data/SRR10800986_1.fastq.gz BC: diff --git a/resources/combined_basic/config.yml b/resources/combined_basic/config.yml index b1c0a16b..c18a618a 100644 --- a/resources/combined_basic/config.yml +++ b/resources/combined_basic/config.yml @@ -8,12 +8,8 @@ assignments: alignment_tool: tool: bbmap configs: - sequence_length: - min: 166 - max: 175 - alignment_start: - min: 1 - max: 3 + sequence_length: 171 + alignment_start: 1 FW: - data/SRR10800986_1.fastq.gz BC: diff --git a/resources/count_basic/config.yml b/resources/count_basic/config.yml index 4e3e11d3..c9c27a8c 100644 --- a/resources/count_basic/config.yml +++ b/resources/count_basic/config.yml @@ -13,3 +13,11 @@ experiments: design_file: design.fa configs: default: {} + outlierNone: + filter: + outlier_detection: + method: none + outlierZscore: + filter: + outlier_detection: + method: rna_counts_zscore diff --git a/workflow/Snakefile b/workflow/Snakefile index c9eef965..b41e7eb6 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -23,7 +23,7 @@ import pandas as pd ##### check snakemake min version ##### from snakemake.utils import min_version -min_version("7.15.1") +min_version("8.24.1") ##### report workflow ##### @@ -134,6 +134,15 @@ rule all: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged.combined.tsv.gz", "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged.combined.tsv.gz", "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz", + "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged_barcode_assigned_counts.tsv.gz", + "results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.all.tsv.gz", + "results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.all.tsv.gz", + ] + ), + getOutputProjectConditionAssignmentConfigThreshold_helper( + [ + "results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", + "results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", ] ), # assignment statistic @@ -263,6 +272,15 @@ rule all_experiments: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged.combined.tsv.gz", "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged.combined.tsv.gz", "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz", + "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged_barcode_assigned_counts.tsv.gz", + 
"results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.all.tsv.gz", + "results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.all.tsv.gz", + ] + ), + getOutputProjectConditionAssignmentConfigThreshold_helper( + [ + "results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", + "results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", ] ), # assignment statistic diff --git a/workflow/rules/assigned_counts.smk b/workflow/rules/assigned_counts.smk index ef0f1197..c91d1a52 100644 --- a/workflow/rules/assigned_counts.smk +++ b/workflow/rules/assigned_counts.smk @@ -67,7 +67,9 @@ rule assigned_counts_assignBarcodes: script=getScript("count/merge_BC_and_assignment.py"), output: counts="results/experiments/{project}/assigned_counts/{assignment}/{condition}_{replicate}_{type}_final_counts.config.{config}.tsv.gz", - stats="results/experiments/{project}/statistic/assigned_counts/{assignment}/{condition}_{replicate}_{type}_{config}.statistic.tsv.gz", + statistic=temp( + "results/experiments/{project}/statistic/assigned_counts/{assignment}/{condition}_{replicate}_{type}_{config}.statistic.tsv.gz" + ), params: name="{condition}_{replicate}_{type}", log: @@ -79,7 +81,7 @@ rule assigned_counts_assignBarcodes: python {input.script} --counts {input.counts} \ --assignment {input.association} \ --output {output.counts} \ - --statistic {output.stats} \ + --statistic {output.statistic} \ --name {params.name} &> {log} """ @@ -97,14 +99,40 @@ rule assigned_counts_dna_rna_merge: output: counts="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_{replicate}_merged_assigned_counts.tsv.gz", bc_counts="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_{replicate}_barcode_assigned_counts.tsv.gz", - stats="results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_{replicate}_merged_assigned_counts.statistic.tsv.gz", + removed_bcs="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_{replicate}_barcodesRemoved_assigned_counts.tsv.gz", + statistic=temp( + "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_{replicate}_merged_assigned_counts.statistic.tsv.gz" + ), params: minRNACounts=lambda wc: config["experiments"][wc.project]["configs"][ wc.config - ]["filter"]["RNA"]["min_counts"], + ]["filter"]["min_rna_counts"], minDNACounts=lambda wc: config["experiments"][wc.project]["configs"][ wc.config - ]["filter"]["DNA"]["min_counts"], + ]["filter"]["min_dna_counts"], + outlier_detection=lambda wc: ( + "--outlier-detection %s " + % config["experiments"][wc.project]["configs"][wc.config]["filter"][ + "outlier_detection" + ]["method"] + if config["experiments"][wc.project]["configs"][wc.config]["filter"][ + "outlier_detection" + ]["method"] + != "none" + else "" + ), + outlier_mad_bins=lambda wc: "--outlier-ratio-mad-bins %d" + % config["experiments"][wc.project]["configs"][wc.config]["filter"][ + "outlier_detection" + ]["mad_bins"], + outlier_mad_times=lambda wc: "--outlier-ratio-mad-times %f" + % config["experiments"][wc.project]["configs"][wc.config]["filter"][ + "outlier_detection" + ]["times_mad"], + outlier_zscore_times=lambda wc: "--outlier-rna-zscore-times %f" + % config["experiments"][wc.project]["configs"][wc.config]["filter"][ + 
"outlier_detection" + ]["times_zscore"], log: temp( "results/logs/assigned_counts/{assignment}/dna_rna_merge.{project}.{condition}.{replicate}.{config}.log" @@ -114,9 +142,11 @@ rule assigned_counts_dna_rna_merge: python {input.script} --counts {input.counts} \ --minRNACounts {params.minRNACounts} --minDNACounts {params.minDNACounts} \ --assignment {input.association} \ + {params.outlier_detection} --outlier-barcodes {output.removed_bcs} \ + {params.outlier_mad_bins} {params.outlier_mad_times} {params.outlier_zscore_times} \ --output {output.counts} \ --bcOutput {output.bc_counts} \ - --statistic {output.stats} &> {log} + --statistic {output.statistic} &> {log} """ @@ -137,7 +167,6 @@ rule assigned_counts_make_master_tables: all="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged.tsv.gz", thresh="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged.tsv.gz", params: - cond="{condition}", files=lambda wc: ",".join( expand( "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_{replicate}_merged_assigned_counts.tsv.gz", @@ -161,7 +190,6 @@ rule assigned_counts_make_master_tables: shell: """ Rscript {input.script} \ - --condition {params.cond} \ --threshold {params.thresh} \ --files {params.files} \ --replicates {params.replicates} \ @@ -185,7 +213,8 @@ rule assigned_counts_combine_replicates_barcode_output: ), script=getScript("count/merge_replicates_barcode_counts.py"), output: - bc_merged="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz", + bc_merged_thresh="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged_barcode_assigned_counts.tsv.gz", + bc_merged_all="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz", params: thresh=lambda wc: config["experiments"][wc.project]["configs"][wc.config][ "filter" @@ -218,7 +247,8 @@ rule assigned_counts_combine_replicates_barcode_output: python {input.script} {params.bc_counts} \ --threshold {params.thresh} \ {params.replicates} \ - --output {output.bc_merged} &> {log} + --output-threshold {output.bc_merged_thresh} \ + --output {output.bc_merged_all} &> {log} """ @@ -250,3 +280,49 @@ rule assigned_counts_combine_replicates: {params.label_file} \ --output {output} &> {log} """ + + +rule assigned_counts_copy_final_all_files: + """ + Will copy final files to the main folder so that it is creal which files to use. + """ + conda: + "../envs/default.yaml" + input: + all=lambda wc: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged.tsv.gz", + bc_all=lambda wc: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz", + output: + all="results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.all.tsv.gz", + bc_all="results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.all.tsv.gz", + log: + temp( + "results/logs/assigned_counts/copy_final_all_files.{project}.{condition}.{assignment}.{config}.log" + ), + shell: + """ + cp {input.all} {output.all} &> {log} + cp {input.bc_all} {output.bc_all} &>> {log} + """ + + +rule assigned_counts_copy_final_thresh_files: + """ + Will copy final files to the main folder so that it is creal which files to use. 
+ """ + conda: + "../envs/default.yaml" + input: + thresh=lambda wc: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged.tsv.gz", + bc_thresh=lambda wc: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged_barcode_assigned_counts.tsv.gz", + output: + thresh="results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", + bc_thresh="results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", + log: + temp( + "results/logs/assigned_counts/copy_final_thresh_files.{project}.{condition}.{assignment}.{config}.{threshold}.log" + ), + shell: + """ + cp {input.thresh} {output.thresh} &> {log} + cp {input.bc_thresh} {output.bc_thresh} &>> {log} + """ diff --git a/workflow/rules/assignment.smk b/workflow/rules/assignment.smk index f3d7a572..fe3aaf2b 100644 --- a/workflow/rules/assignment.smk +++ b/workflow/rules/assignment.smk @@ -29,28 +29,24 @@ rule assignment_check_design: start=lambda wc: ( config["assignments"][wc.assignment]["alignment_tool"]["configs"][ "alignment_start" - ] - if config["assignments"][wc.assignment]["alignment_tool"]["tool"] - == "exact" + ]["max"] + if config["assignments"][wc.assignment]["alignment_tool"]["tool"] == "bwa" else config["assignments"][wc.assignment]["alignment_tool"]["configs"][ "alignment_start" - ]["max"] + ] ), length=lambda wc: ( config["assignments"][wc.assignment]["alignment_tool"]["configs"][ "sequence_length" - ] - if config["assignments"][wc.assignment]["alignment_tool"]["tool"] - == "exact" + ]["min"] + if config["assignments"][wc.assignment]["alignment_tool"]["tool"] == "bwa" else config["assignments"][wc.assignment]["alignment_tool"]["configs"][ "sequence_length" - ]["min"] + ] ), fast_check=lambda wc: ( "--fast-dict" if config["assignments"][wc.assignment]["design_check"]["fast"] - or config["assignments"][wc.assignment]["alignment_tool"]["tool"] - == "bbmap" else "--slow-string-search" ), check_sequence_collitions=lambda wc: ( @@ -248,7 +244,7 @@ rule assignment_filter: python {input.script} \ -m {params.min_support} -f {params.fraction} {params.unknown_other} {params.ambiguous} | \ tee >(gzip -c > {output.ambigous}) | \ - awk -v "OFS=\\t" -F"\\t" '{{ if (($2 != \"ambiguous\") && ($2 != \"other\")) {{ print $0 }} }}' | \ + awk -v "OFS=\\t" -F"\\t" '{{ if (($2 != \"ambiguous\") && ($2 != \"other\")) {{ print $1,$2 }} }}' | \ gzip -c > {output.final} 2> {log.err}; gzip -l {output.final} | awk 'NR==2 {{exit($2==0)}}' || {{ echo "Error: Empty barcode file {output.final}. No barcodes detected!" 
>> {log.err}; exit 1; }} """ diff --git a/workflow/rules/assignment/statistic.smk b/workflow/rules/assignment/statistic.smk index 35b6c64f..3af3cfad 100644 --- a/workflow/rules/assignment/statistic.smk +++ b/workflow/rules/assignment/statistic.smk @@ -48,7 +48,7 @@ rule assignment_statistic_assignment: conda: "../../envs/r.yaml" input: - bc="results/assignment/{assignment}/assignment_barcodes.{assignment_config}.tsv.gz", + bc="results/assignment/{assignment}/assignment_barcodes_with_ambigous.{assignment_config}.tsv.gz", script=getScript("assignment/statistic_assignment.R"), output: stats="results/assignment/{assignment}/statistic/assignment.{assignment_config}.tsv.gz", diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index c0615c9b..42e50ed5 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -374,6 +374,34 @@ def getOutputProjectConditionAssignmentConfig_helper(files): return output +def getOutputProjectConditionAssignmentConfigThreshold_helper(files): + """ + Inserts {project}, {condition}, {assignment} {config} (from configs of project) and Threshold from config into given file. + """ + output = [] + projects = getProjects() + for project in projects: + try: + conditions = getConditions(project) + for condition in conditions: + for conf in getConfigs(project): + threshold = config["experiments"][project]["configs"][conf][ + "filter" + ]["bc_threshold"] + for file in files: + output += expand( + file, + project=project, + condition=condition, + assignment=getProjectAssignments(project), + config=conf, + threshold=threshold, + ) + except MissingAssignmentInConfigException: + continue + return output + + def getOutputProjectAssignmentConfig_helper(files, betweenReplicates=False): """ Inserts {project}, {assignment} and {config} (from configs of project) from config into given file. 
@@ -471,11 +499,8 @@ def useSampling(project, conf, dna_or_rna): def withoutZeros(project, conf): return ( - config["experiments"][project]["configs"][conf]["filter"]["DNA"]["min_counts"] - > 0 - and config["experiments"][project]["configs"][conf]["filter"]["RNA"][ - "min_counts" - ] + config["experiments"][project]["configs"][conf]["filter"]["min_dna_counts"] > 0 + and config["experiments"][project]["configs"][conf]["filter"]["min_rna_counts"] > 0 ) @@ -608,8 +633,8 @@ def counts_aggregate_demultiplex_input(project): def counts_getFilterConfig(project, conf, dna_or_rna, command): - value = config["experiments"][project]["configs"][conf]["filter"][dna_or_rna][ - command + value = config["experiments"][project]["configs"][conf]["filter"][ + "min_%s_counts" % dna_or_rna.lower() ] filterMap = {"min_counts": "minCounts"} if isinstance(value, int): diff --git a/workflow/rules/statistic/assigned_counts.smk b/workflow/rules/statistic/assigned_counts.smk index 9a32af44..15ae1ad7 100644 --- a/workflow/rules/statistic/assigned_counts.smk +++ b/workflow/rules/statistic/assigned_counts.smk @@ -86,7 +86,9 @@ rule statistic_assigned_counts_combine_stats_dna_rna_merge: ), script=getScript("count/merge_statistic_tables.py"), output: - "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/combined/{condition}_merged_assigned_counts.statistic.tsv.gz", + temp( + "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/combined/{condition}_merged_assigned_counts.statistic.tsv.gz" + ), params: cond="{condition}", statistic=lambda wc: " ".join( diff --git a/workflow/rules/statistic/bc_overlap.smk b/workflow/rules/statistic/bc_overlap.smk index 77e01cb8..7ac65bd1 100644 --- a/workflow/rules/statistic/bc_overlap.smk +++ b/workflow/rules/statistic/bc_overlap.smk @@ -50,7 +50,7 @@ rule statistic_bc_overlap_combine_counts: conda: "../../envs/default.yaml" input: - stats=lambda wc: expand( + statistic=lambda wc: expand( "results/experiments/{{project}}/statistic/bc_overlap/counts/overlapBCandCounts.{condition}_{type}.{config}.tsv", type=["DNA", "RNA"], condition=getConditions(wc.project), @@ -74,8 +74,8 @@ rule statistic_bc_overlap_combine_counts: """ set +o pipefail; ( - cat {input.stats[0]} | head -n 1; - for i in {input.stats}; do + cat {input.statistic[0]} | head -n 1; + for i in {input.statistic}; do cat $i | tail -n +2 done; ) > {output} 2> {log} @@ -86,7 +86,7 @@ rule statistic_bc_overlap_combine_assigned_counts: conda: "../../envs/default.yaml" input: - stats=lambda wc: expand( + statistic=lambda wc: expand( "results/experiments/{{project}}/statistic/bc_overlap/assigned_counts/{{assignment}}/overlapBCandCounts.{condition}_{type}.{{config}}.tsv", type=["DNA", "RNA"], condition=getConditions(wc.project), @@ -111,8 +111,8 @@ rule statistic_bc_overlap_combine_assigned_counts: """ set +o pipefail; ( - cat {input.stats[0]} | head -n 1; - for i in {input.stats}; do + cat {input.statistic[0]} | head -n 1; + for i in {input.statistic}; do cat $i | tail -n +2 done; ) > {output} 2> {log} diff --git a/workflow/rules/statistic/correlation.smk b/workflow/rules/statistic/correlation.smk index 4175ec4a..90eb1d53 100644 --- a/workflow/rules/statistic/correlation.smk +++ b/workflow/rules/statistic/correlation.smk @@ -45,7 +45,9 @@ rule statistic_correlation_bc_counts: "Plot": "Ratio", }, ), - "results/experiments/{project}/statistic/barcode/{raw_or_assigned}/{condition}_{config}_barcode_correlation.tsv", + temp( + 
"results/experiments/{project}/statistic/barcode/{raw_or_assigned}/{condition}_{config}_barcode_correlation.tsv" + ), params: replicates=lambda wc: ",".join( getMergedCounts(wc.project, wc.raw_or_assigned, wc.condition, wc.config)[1] @@ -57,10 +59,10 @@ rule statistic_correlation_bc_counts: ), minRNACounts=lambda wc: config["experiments"][wc.project]["configs"][ wc.config - ]["filter"]["RNA"]["min_counts"], + ]["filter"]["min_rna_counts"], minDNACounts=lambda wc: config["experiments"][wc.project]["configs"][ wc.config - ]["filter"]["DNA"]["min_counts"], + ]["filter"]["min_dna_counts"], log: temp( "results/logs/statistic/correlation/correlate_bc_counts.{project}.{condition}.{config}.{raw_or_assigned}.log" @@ -97,10 +99,10 @@ rule statistic_correlation_bc_counts_hist: ), minRNACounts=lambda wc: config["experiments"][wc.project]["configs"][ wc.config - ]["filter"]["RNA"]["min_counts"], + ]["filter"]["min_rna_counts"], minDNACounts=lambda wc: config["experiments"][wc.project]["configs"][ wc.config - ]["filter"]["DNA"]["min_counts"], + ]["filter"]["min_dna_counts"], log: temp( "results/logs/statistic/correlation/correlate_bc_counts_hist.{project}.{condition}.{config}.{raw_or_assigned}.log" @@ -287,8 +289,12 @@ rule statistic_correlation_calculate: ), }, ), - "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_correlation.tsv", - "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_correlation_minThreshold.tsv", + temp( + "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_correlation.tsv" + ), + temp( + "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_correlation_minThreshold.tsv" + ), params: cond="{condition}", files=lambda wc: ",".join( diff --git a/workflow/rules/variants.smk b/workflow/rules/variants.smk index 3b46a721..69894b32 100644 --- a/workflow/rules/variants.smk +++ b/workflow/rules/variants.smk @@ -48,10 +48,10 @@ rule variants_MasterTable: ), minRNACounts=lambda wc: config["experiments"][wc.project]["configs"][ wc.config - ]["filter"]["RNA"]["min_counts"], + ]["filter"]["min_rna_counts"], minDNACounts=lambda wc: config["experiments"][wc.project]["configs"][ wc.config - ]["filter"]["DNA"]["min_counts"], + ]["filter"]["min_dna_counts"], log: temp( "results/logs/experiments/variants/MasterTable.{project}.{assignment}/{config}.{condition}.log" diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 5ededb35..1f069180 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -99,29 +99,13 @@ properties: min_mapping_quality: type: integer minimum: 0 - default: 35 + default: 30 sequence_length: - type: object - properties: - min: - type: integer - max: - type: integer - additionalProperties: false - required: - - min - - max + type: integer + minimum: 1 alignment_start: - type: object - properties: - min: - type: integer - max: - type: integer - additionalProperties: false - required: - - min - - max + type: integer + minimum: 1 additionalProperties: false required: - min_mapping_quality @@ -339,33 +323,54 @@ properties: properties: filter: type: object + default: {} properties: bc_threshold: type: integer minimum: 1 default: 10 - patternProperties: - ^((DNA)|(RNA))$: + outlier_detection: type: object properties: - min_counts: + method: + type: string + enum: + - rna_counts_zscore + - ratio_mad + - none + default: rna_counts_zscore + mad_bins: type: integer - 
miminum: 0 - default: 1 - additionalProperties: false + minimum: 1 + default: 20 + times_mad: + type: number + exclusiveMinimum: 0 + default: 5 + times_zscore: + type: number + exclusiveMinimum: 0 + default: 3 required: - - min_counts - default: - bc_threshold: 10 - DNA: - min_counts: 1 - RNA: - min_counts: 1 + - method + - mad_bins + - times_mad + - times_zscore + additionalProperties: false + default: {} + min_dna_counts: + type: integer + miminum: 0 + default: 1 + min_rna_counts: + type: integer + miminum: 0 + default: 1 + additionalProperties: false required: - bc_threshold - - DNA - - RNA - additionalProperties: false + - min_rna_counts + - min_dna_counts sampling: type: object patternProperties: diff --git a/workflow/scripts/assignment/statistic_assignment.R b/workflow/scripts/assignment/statistic_assignment.R index 9beb9092..e699d586 100644 --- a/workflow/scripts/assignment/statistic_assignment.R +++ b/workflow/scripts/assignment/statistic_assignment.R @@ -3,38 +3,38 @@ library(optparse) option_list <- list( - make_option(c("-i", "--input"), - type = "character", - help = "Assignment parcode file" - ), - make_option(c("-p", "--plot"), - type = "character", - help = "histogram plot" - ), - make_option(c("-s", "--statistic"), - type = "character", - help = "Statistics" - ) + make_option(c("-i", "--input"), + type = "character", + help = "Assignment parcode file" + ), + make_option(c("-p", "--plot"), + type = "character", + help = "histogram plot" + ), + make_option(c("-s", "--statistic"), + type = "character", + help = "Statistics" + ) ) arguments <- parse_args(OptionParser(option_list = option_list), positional_arguments = TRUE) opt <- arguments$options if (!"input" %in% names(opt)) { - stop("--input parameter must be provided. See script usage (--help)") + stop("--input parameter must be provided. See script usage (--help)") } if (!"plot" %in% names(opt)) { - stop("--plot parameter must be provided. See script usage (--help)") + stop("--plot parameter must be provided. See script usage (--help)") } if (!"statistic" %in% names(opt)) { - stop("--statistic parameter must be provided. See script usage (--help)") + stop("--statistic parameter must be provided. 
See script usage (--help)") } bcs <- read_delim(opt$input, delim = "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE) bcs <- bcs %>% - filter(!(X2 %in% c("other", "ambiguous"))) %>% - separate(X4, c("support", "all"), sep = "/", convert = TRUE) + filter(!(X2 %in% c("other", "ambiguous"))) %>% + separate(X4, c("support", "all"), sep = "/", convert = TRUE) @@ -42,13 +42,13 @@ avg_support <- mean(bcs$support / bcs$all) median_support <- median(bcs$support / bcs$all) bcs <- bcs %>% - group_by(X2) %>% - count() %>% - ungroup() + group_by(X2) %>% + count() %>% + ungroup() oligos_support <- bcs %>% - filter(n >= 15) %>% - nrow() + filter(n >= 15) %>% + nrow() output <- data.frame(Value = c(avg_support, median_support, oligos_support)) @@ -57,23 +57,25 @@ rownames(output) <- c("Average support of BC for Oligo:", "Median support of BC write.table(output, file = opt$statistic, quote = FALSE, sep = "\t", row.names = TRUE, col.names = FALSE) p <- ggplot(bcs, aes(n)) + - geom_histogram(bins = 150) + - xlab("Number of BCs") + - ylab("Number of Oligos") + - geom_vline(aes(xintercept = mean(bcs$n)), col = "red", linewidth = 2) + - geom_text( - aes( - label = sprintf("\nmean = %.2f", mean(bcs$n)), - x = Inf, y = Inf - ), hjust = 1.1, vjust = 1.3, colour = "red" - ) + - geom_vline(aes(xintercept = median(bcs$n)), col = "blue", linewidth = 2) + - geom_text( - aes( - label = sprintf("median = %.2f\n", median(bcs$n)), - x = Inf, y = Inf - ), hjust = 1.1, vjust = 1.3, colour = "blue" - ) + - theme_bw() + geom_histogram(bins = 150) + + xlab("Number of BCs") + + ylab("Number of Oligos") + + geom_vline(aes(xintercept = mean(bcs$n)), col = "red", linewidth = 2) + + geom_text( + aes( + label = sprintf("\nmean = %.2f", mean(bcs$n)), + x = Inf, y = Inf + ), + hjust = 1.1, vjust = 1.3, colour = "red" + ) + + geom_vline(aes(xintercept = median(bcs$n)), col = "blue", linewidth = 2) + + geom_text( + aes( + label = sprintf("median = %.2f\n", median(bcs$n)), + x = Inf, y = Inf + ), + hjust = 1.1, vjust = 1.3, colour = "blue" + ) + + theme_bw() ggsave(opt$plot, p, type = "cairo", dpi = 300) diff --git a/workflow/scripts/count/combine_replicates.py b/workflow/scripts/count/combine_replicates.py index 7e3e69d5..d5af1c44 100755 --- a/workflow/scripts/count/combine_replicates.py +++ b/workflow/scripts/count/combine_replicates.py @@ -42,22 +42,21 @@ def cli(input_file, label_file, output_file): df_out.to_csv(output_file, sep="\t", index=False) def combine_replicates(label_file, df_allreps, total_dna_counts, total_rna_counts): - df_allreps = df_allreps.groupby(by=["condition", "name"]).aggregate( + df_allreps = df_allreps.groupby(by=["oligo_name"]).aggregate( { "replicate": "count", "dna_counts": ["sum", "mean"], "rna_counts": ["sum", "mean"], "dna_normalized": "mean", "rna_normalized": "mean", - "ratio": "mean", - "log2": "mean", - "n_obs_bc": ["sum", "mean"], + "log2FoldChange": "mean", + "n_bc": ["sum", "mean"], } ) df_allreps = df_allreps.reset_index() - df_out = df_allreps.iloc[:, 0:2] - df_out.columns = ["condition", "name"] + df_out = df_allreps.iloc[:, 0:1] + df_out.columns = ["oligo_name"] df_out["replicates"] = df_allreps.replicate["count"] @@ -70,20 +69,19 @@ def combine_replicates(label_file, df_allreps, total_dna_counts, total_rna_count df_out["dna_normalized"] = df_out["dna_counts"] / total_dna_counts * scaling df_out["rna_normalized"] = df_out["rna_counts"] / total_rna_counts * scaling - df_out["ratio"] = df_out["rna_normalized"] / df_out["dna_normalized"] - df_out["log2"] = np.log2(df_out.ratio) + 
df_out["log2FoldChange"] = np.log2(df_out["rna_normalized"] / df_out["dna_normalized"]) df_out["mean_dna_counts"] = df_allreps.dna_counts["mean"] df_out["mean_rna_counts"] = df_allreps.rna_counts["mean"] df_out["mean_dna_normalized"] = df_allreps.dna_normalized["mean"] df_out["mean_rna_normalized"] = df_allreps.rna_normalized["mean"] - df_out["mean_ratio"] = df_allreps.ratio["mean"] - df_out["mean_log2"] = df_allreps.log2["mean"] + + df_out["mean_log2FoldChange"] = df_allreps.log2FoldChange["mean"] if label_file: - df_labels = pd.read_csv(label_file, sep="\t", names=["name", "Label"]) - df_out = df_out.join(df_labels.set_index("name"), on="name") + df_labels = pd.read_csv(label_file, sep="\t", names=["oligo_name", "Label"]) + df_out = df_out.join(df_labels.set_index("oligo_name"), on="oligo_name") return df_out diff --git a/workflow/scripts/count/make_master_tables.R b/workflow/scripts/count/make_master_tables.R index 569337f9..03195627 100644 --- a/workflow/scripts/count/make_master_tables.R +++ b/workflow/scripts/count/make_master_tables.R @@ -1,4 +1,4 @@ -#Adapted from Vikram Agarwal by Gracie Gordon +# Adapted from Vikram Agarwal by Gracie Gordon and Max Schubach library(dplyr) @@ -6,30 +6,39 @@ library(optparse) option_list <- list( - make_option(c("-c", "--condition"), type="character", - help="Condition name"), - make_option(c("-l", "--label"), type="character", - help="Label file. (optional)"), - make_option(c("-f", "--files"), type="character", - help="Comma separated input files of assigned counts"), - make_option(c("-r", "--replicates"), type="character", - help="Comma separated name of the replicates (same order than files)"), - make_option(c("-a", "--output-all"), type="character", - help="Output file of master table. No BC threshold filter (optional)."), - make_option(c("-o", "--output"), type="character", - help="Output file of master table filtered by --threshold"), - make_option(c("-s", "--statistic"), type="character", - help="Statistic of master table and filtered master table"), - make_option(c("-t", "--threshold"), type="integer", default=10, - help="Number of required barcodes (default 10)") + make_option(c("-l", "--label"), + type = "character", + help = "Label file. (optional)" + ), + make_option(c("-f", "--files"), + type = "character", + help = "Comma separated input files of assigned counts" + ), + make_option(c("-r", "--replicates"), + type = "character", + help = "Comma separated name of the replicates (same order than files)" + ), + make_option(c("-a", "--output-all"), + type = "character", + help = "Output file of master table. No BC threshold filter (optional)." + ), + make_option(c("-o", "--output"), + type = "character", + help = "Output file of master table filtered by --threshold" + ), + make_option(c("-s", "--statistic"), + type = "character", + help = "Statistic of master table and filtered master table" + ), + make_option(c("-t", "--threshold"), + type = "integer", default = 10, + help = "Number of required barcodes (default 10)" + ) ) -arguments <- parse_args(OptionParser(option_list=option_list), positional_arguments=TRUE) +arguments <- parse_args(OptionParser(option_list = option_list), positional_arguments = TRUE) opt <- arguments$options -if (!"condition" %in% names(opt)) { - stop("--condition parameter must be provided. See script usage (--help)") -} if (!"files" %in% names(opt)) { stop("--files parameter must be provided. 
See script usage (--help)") } @@ -44,69 +53,88 @@ if (!"statistic" %in% names(opt)) { } - -#exp=args[1] -cond=opt$cond -thresh=opt$threshold -files <- strsplit(opt$files,",")[[1]] -replicates=strsplit(opt$replicates,",")[[1]] +# exp=args[1] +cond <- opt$cond +thresh <- opt$threshold +files <- strsplit(opt$files, ",")[[1]] +replicates <- strsplit(opt$replicates, ",")[[1]] if (length(files) != length(replicates)) { - stop("Number of input files must be euqal to number of replicates") + stop("Number of input files must be euqal to number of replicates") } -outfile=opt$output -avg_outfile=opt$statistic -#out=args[3] +outfile <- opt$output +avg_outfile <- opt$statistic -##MAKE MASTER TABLE -masterTable = data.frame() -for (i in 1:length(files)){ - file=files[i] - rep=replicates[i] +precision <- 4 - table = as.data.frame(read.table(file,header=TRUE),stringsAsFactors=FALSE) - table$condition = cond - table$replicate = rep - - masterTable <- masterTable %>% bind_rows(table) -} +## MAKE MASTER TABLE +master_table <- data.frame() +for (i in seq_len(length(files))) { + file <- files[i] + rep <- replicates[i] -masterTable <- masterTable %>% group_by(condition,replicate) %>% - select(condition, replicate, name, dna_counts, rna_counts, dna_normalized, rna_normalized, ratio, log2, n_obs_bc) + table <- as.data.frame(read.table(file, header = TRUE), stringsAsFactors = FALSE) + table$replicate <- rep -masterTableFiltered <- masterTable %>% - filter(n_obs_bc >= thresh) %>% - mutate(ratio = ratio/median(ratio), log2 = round(log2(ratio),8)) -masterTable <- masterTable %>% mutate(ratio = ratio/median(ratio), log2 = round(log2(ratio),8)) + master_table <- master_table %>% bind_rows(table) +} -writeFile <- function(file,table) { +master_table <- master_table %>% + group_by(replicate) %>% + filter(!oligo_name %in% c("no_BC")) %>% + mutate( + dna_normalized = round(dna_normalized, precision), + rna_normalized = round(rna_normalized, precision), + ratio = round(ratio, precision), + log2FoldChange = round(log2FoldChange, precision) + ) + +master_table_filtered <- master_table %>% + filter(n_bc >= thresh) +# TODO use this? Maybe not because it normalizes across all replicates +# master_table_filtered %>% mutate(ratio = ratio / median(ratio)) + +# TODO use this? 
diff --git a/workflow/scripts/count/merge_label.py b/workflow/scripts/count/merge_label.py
index 2a0b16d3..d6005a6c 100644
--- a/workflow/scripts/count/merge_label.py
+++ b/workflow/scripts/count/merge_label.py
@@ -39,6 +39,56 @@
     type=int,
     help="Minimum number of DNA counts required per barcode. If 0 pesudocounts are used.",
 )
+@click.option(
+    "--include-not-assigned-for-normalization/--exclude-not-assigned-for-normalization",
+    "normalize_with_not_assigned",
+    default=False,
+    show_default=True,
+    help="Use barcodes that are not assigned for normalization.",
+)
+@click.option(
+    "--scaling",
+    "scaling",
+    default=10**6,
+    show_default=True,
+    type=float,
+    help="Scaling parameter. Usually counts per million (10**6).",
+)
+@click.option(
+    "--outlier-detection",
+    "outlier_detection_method",
+    required=False,
+    type=click.Choice(["ratio_mad", "rna_counts_zscore"], case_sensitive=False),
+    help="If set, use one of the outlier detection methods.",
+)
+@click.option(
+    "--outlier-barcodes",
+    "removed_barcodes_file",
+    required=False,
+    type=click.Path(writable=True),
+    help="If outlier detection is enabled, the barcodes that were removed are written to this file.",
+)
+@click.option(
+    "--outlier-ratio-mad-bins",
+    "ratio_mad_n_bins",
+    default=20,
+    type=int,
+    help="Number of RNA-count quantile bins within which outlier removal is performed.",
+)
+@click.option(
+    "--outlier-ratio-mad-times",
+    "ratio_mad_times",
+    default=5,
+    type=float,
+    help="A barcode's ratio difference must not be larger than this value times the MAD within its bin.",
+)
+@click.option(
+    "--outlier-rna-zscore-times",
+    "rna_zscore_times",
+    default=3,
+    type=float,
+    help="A barcode's absolute RNA-count z-score must not be larger than this value.",
+)
 @click.option(
     "--output",
     "output_file",
@@ -64,6 +114,13 @@ def cli(
     assignment_file,
     minRNACounts,
     minDNACounts,
+    normalize_with_not_assigned,
+    scaling,
+    outlier_detection_method,
+    ratio_mad_times,
+    ratio_mad_n_bins,
+    rna_zscore_times,
+    removed_barcodes_file,
     output_file,
     statistic_file,
     bc_output_file,
@@ -86,6 +143,7 @@
             "total rna counts": [0],
             "avg dna counts per bc": [0.0],
             "avg rna counts per bc": [0.0],
+            "barcode outlier removed": [0.0],
             "avg dna/rna barcodes per oligo": [0.0],
         }
     )
@@ -97,19 +155,19 @@
         header=None,
         usecols=[0, 1],
         sep="\t",
-        names=["Barcode", "Oligo"],
+        names=["barcode", "oligo"],
     )
     # drop duplicated barcodes!!!
-    assoc.drop_duplicates("Barcode", keep=False, inplace=True)
-    assoc.set_index("Barcode", inplace=True)
+    assoc.drop_duplicates("barcode", keep=False, inplace=True)
+    assoc.set_index("barcode", inplace=True)
 
-    statistic["oligos design"] = assoc.Oligo.nunique()
+    statistic["oligos design"] = assoc.oligo.nunique()
     statistic[["barcodes design"]] = assoc.shape[0]
 
     # get count df
     click.echo("Read count file...")
     counts = pd.read_csv(
-        counts_file, sep="\t", header=None, names=["Barcode", "dna_count", "rna_count"]
+        counts_file, sep="\t", header=None, names=["barcode", "dna_count", "rna_count"]
     )
     # filter
     counts = counts[
@@ -120,11 +178,11 @@
 
     # fill in labels from dictionary
     click.echo("Combine assignment with count file...")
-    counts = pd.merge(assoc, counts, how="right", on="Barcode")
-    counts.Oligo.fillna("no_BC", inplace=True)
-    counts.rename(columns={"Oligo": "name"}, inplace=True)
-    # counts = counts[['name', 'Barcode', 'dna_count', 'rna_count']]
-    statistic[["unknown barcodes dna/rna"]] = counts[counts.name == "no_BC"].shape[0]
+    counts = pd.merge(assoc, counts, how="right", on="barcode")
+    counts["oligo"] = counts["oligo"].fillna("no_BC")
+    counts.rename(columns={"oligo": "oligo_name"}, inplace=True)
+    # counts = counts[['oligo_name', 'barcode', 'dna_count', 'rna_count']]
+    statistic[["unknown barcodes dna/rna"]] = counts[counts.oligo_name == "no_BC"].shape[0]
 
     statistic["matched barcodes"] = (
         statistic["barcodes dna/rna"] - statistic["unknown barcodes dna/rna"]
@@ -133,18 +191,21 @@
         statistic["matched barcodes"] / statistic["barcodes dna/rna"]
     ) * 100.0
 
-    if bc_output_file:
-        counts.to_csv(bc_output_file, index=False, sep="\t", compression="gzip")
+    # add pseudocount if needed:
+    counts["dna_count"] = counts["dna_count"] + pseudocountDNA
+    counts["rna_count"] = counts["rna_count"] + pseudocountRNA
 
-    # remove Barcorde. Not needed anymore
-    counts.drop(["Barcode"], axis=1, inplace=True)
 
     # number of DNA and RNA counts
     total_dna_counts = sum(counts["dna_count"])
     total_rna_counts = sum(counts["rna_count"])
 
+    unknown_dna_counts = sum(counts[counts.oligo_name == "no_BC"]["dna_count"])
+    unknown_rna_counts = sum(counts[counts.oligo_name == "no_BC"]["rna_count"])
+
     statistic["total dna counts"] = total_dna_counts
     statistic["total rna counts"] = total_rna_counts
+
     statistic["avg dna counts per bc"] = (
         statistic["total dna counts"] / statistic["barcodes dna/rna"]
     )
@@ -152,64 +213,73 @@
         statistic["total rna counts"] / statistic["barcodes dna/rna"]
     )
 
-    grouped_label = counts.groupby("name").agg(
+    if not normalize_with_not_assigned:
+        counts = counts[counts.oligo_name != "no_BC"]
+        total_dna_counts -= unknown_dna_counts
+        total_rna_counts -= unknown_rna_counts
+
+    # outlier removal
+
+    if outlier_detection_method:
+        if outlier_detection_method == "ratio_mad":
+            counts, removed_barcodes = outlier_removal_by_mad(counts, ratio_mad_n_bins, ratio_mad_times)
+        elif outlier_detection_method == "rna_counts_zscore":
+            counts, removed_barcodes = outlier_removal_by_rna_zscore(counts, rna_zscore_times)
+        else:
+            raise ValueError("Outlier removal method not recognized")
+    else:
+        removed_barcodes = pd.DataFrame({"barcode": []})
+
+    if removed_barcodes_file:
+        removed_barcodes.to_csv(removed_barcodes_file, columns=["barcode"], header=False, index=False)
+
+    statistic["barcode outlier removed"] = removed_barcodes.shape[0]
+
+    # BC output file
+    if bc_output_file:
+        counts[["barcode", "oligo_name", "dna_count", "rna_count"]].to_csv(bc_output_file, index=False, sep="\t", compression="gzip")
+
+    # remove barcode column. Not needed anymore
+    counts.drop(["barcode"], axis=1, inplace=True)
+
+    # group by oligo name
+    grouped_label = counts.groupby("oligo_name").agg(
         {"dna_count": ["sum", "count"], "rna_count": ["sum", "count"]}
     )
+    grouped_label.reset_index(inplace=True)
 
-    # add pseudo BC counts to total number of counts if needed
-    total_dna_counts = (
-        total_dna_counts + sum(grouped_label.dna_count["count"]) * pseudocountDNA
-    )
-    total_rna_counts = (
-        total_rna_counts + sum(grouped_label.rna_count["count"]) * pseudocountRNA
-    )
-
     output = pd.DataFrame()
     click.echo(grouped_label.head())
-    output["name"] = grouped_label["name"]
+    output["oligo_name"] = grouped_label["oligo_name"]
     output["dna_counts"] = grouped_label.dna_count["sum"]
     output["rna_counts"] = grouped_label.rna_count["sum"]
 
-    statistic["oligos dna/rna"] = len(grouped_label) - 1
+    statistic["oligos dna/rna"] = len(grouped_label)
 
     statistic["avg dna/rna barcodes per oligo"] = (
-        sum(grouped_label[grouped_label["name"] != "no_BC"].dna_count["count"])
+        sum(grouped_label[grouped_label["oligo_name"] != "no_BC"].dna_count["count"])
         / statistic["oligos dna/rna"]
     )
 
-    # scaling = 10**min([len(str(total_dna_counts))-1,len(str(total_rna_counts))-1])
-    scaling = 10**6
     output["dna_normalized"] = (
-        (
-            (
-                grouped_label.dna_count["sum"]
-                + pseudocountDNA * grouped_label.dna_count["count"]
-            )
-            / grouped_label.dna_count["count"]
-        )
+        (grouped_label.dna_count["sum"] / grouped_label.dna_count["count"])
         / total_dna_counts
         * scaling
     )
     output["rna_normalized"] = (
-        (
-            (
-                grouped_label.rna_count["sum"]
-                + pseudocountRNA * grouped_label.rna_count["count"]
-            )
-            / grouped_label.rna_count["count"]
-        )
+        (grouped_label.rna_count["sum"] / grouped_label.rna_count["count"])
        / total_rna_counts
         * scaling
     )
output["dna_normalized"] - output["log2"] = np.log2(output.ratio) + output["log2FoldChange"] = np.log2(output.ratio) - output["n_obs_bc"] = grouped_label.dna_count["count"] + output["n_bc"] = grouped_label.dna_count["count"] click.echo(output_file) @@ -217,6 +287,40 @@ def cli( statistic.to_csv(statistic_file, index=False, sep="\t", compression="gzip") +def outlier_removal_by_rna_zscore(df, times_zscore = 3): + df["mean"] = df.groupby('oligo_name')['rna_count'].transform('mean') + + df["std"] = df.groupby('oligo_name')['rna_count'].transform('std') + df["rna_z_scores"] = (df["rna_count"] - df["mean"]) / df["std"] + + m = df.rna_z_scores.abs() <= times_zscore + barcodes_removed = df[~m].barcode + df = df[m] + return df[m], barcodes_removed + +def outlier_removal_by_mad(df, n_bins = 20, times_mad = 5): + # df = df[df.oligo_name != "no_BC"] + # Calculate ratio, ratio_med, ratio_diff, and mad + df['ratio'] = np.log2(df["dna_count"] / df["rna_count"]) + df['ratio_med'] = df.groupby('oligo_name')['ratio'].transform('median') + df['ratio_diff'] = df['ratio'] - df['ratio_med'] + + # Calculate quantiles within n_bins + qs = np.unique(np.quantile(np.log10(df['rna_count']), np.arange(0, n_bins) / n_bins)) + + # Create bins based on rna_count + df['bin'] = pd.cut(np.log10(df['rna_count']), bins=qs, include_lowest=True, labels=[str(i) for i in range(0, len(qs)-1)]) + # Filter based on ratio_diff and mad + df['ratio_diff_med'] = df.groupby('bin', observed=True)['ratio_diff'].transform('median') + df['ratio_diff_med_dist'] = np.abs(df['ratio_diff'] - df['ratio_diff_med']) + df['mad'] = df.groupby('bin', observed=True)['ratio_diff_med_dist'].transform('median') + # df['mad'] = df.groupby('bin', observed=True)['ratio_diff'].transform(lambda x: np.median(np.abs(x - np.median(x)))) + + m = df.ratio_diff <= times_mad * df.mad + barcodes_removed = df[~m].barcode + df = df[m] + + return df, barcodes_removed if __name__ == "__main__": cli() diff --git a/workflow/scripts/count/merge_replicates_barcode_counts.py b/workflow/scripts/count/merge_replicates_barcode_counts.py index a5f79056..a5495264 100644 --- a/workflow/scripts/count/merge_replicates_barcode_counts.py +++ b/workflow/scripts/count/merge_replicates_barcode_counts.py @@ -29,82 +29,93 @@ ) @click.option( "--output", + "output_threshold_file", + required=True, + type=click.Path(writable=True), + help="Output file.", +) +@click.option( + "--output-threshold", "output_file", required=True, type=click.Path(writable=True), help="Output file.", ) -def cli(counts_files, bc_thresh, replicates, output_file): - """ - Merge the associated barcode count files of all replicates. - """ - - # ensure there are as many replicates as there are files - if len(replicates) != len(counts_files): - raise ( - click.BadParameter( - "Number of replicates ({}) doesn't equal the number of files ({}).".format( - len(replicates), len(counts_files) - ) - ) - ) +def cli(counts_files, bc_thresh, replicates, output_threshold_file, output_file): + """ + Merge the associated barcode count files of all replicates. 
+ """ - # check if every file exists - for file in counts_files: - if not os.path.exists(file): - raise (click.BadParameter("{}: file not found".format(file))) + # ensure there are as many replicates as there are files + if len(replicates) != len(counts_files): + raise ( + click.BadParameter( + "Number of replicates ({}) doesn't equal the number of files ({}).".format( + len(replicates), len(counts_files) + ) + ) + ) - all_reps = [] - for file in counts_files: - curr_rep = -1 - # find the replicate name of the current file - for rep in replicates: - if rep in os.path.basename(file).split("_")[1]: - curr_rep = rep - break - if curr_rep == -1: - raise (click.BadParameter("{}: incorrect file".format(file))) - df = pd.read_csv(file, sep="\t") - df['replicate'] = curr_rep - all_reps.append(df) + # check if every file exists + for file in counts_files: + if not os.path.exists(file): + raise (click.BadParameter("{}: file not found".format(file))) - df = pd.concat(all_reps) - df = df[df["name"] != "no_BC"] + all_reps = [] + for file in counts_files: + curr_rep = -1 + # find the replicate name of the current file + for rep in replicates: + if rep in os.path.basename(file).split("_")[1]: + curr_rep = rep + break + if curr_rep == -1: + raise (click.BadParameter("{}: incorrect file".format(file))) + df = pd.read_csv(file, sep="\t") + df['replicate'] = curr_rep + all_reps.append(df) - # only keep oligo's with a number of barcodes of at least the given threshold - df_filtered = df.groupby(["name", "replicate"]).filter(lambda x: len(x) >= bc_thresh) + df = pd.concat(all_reps) + df = df[df["oligo_name"] != "no_BC"] - # pivot table to make a dna and rna count column for every replicate - df_filtered = df_filtered.pivot_table( - values=["dna_count", "rna_count"], - index=["Barcode", "name"], - columns="replicate", - aggfunc='first' - ) - df_filtered = df_filtered.sort_values("name") + def pivot_table(df, replicates): + # pivot table to make a dna and rna count column for every replicate + df = df.pivot_table( + values=["dna_count", "rna_count"], + index=["barcode", "oligo_name"], + columns="replicate", + aggfunc='first' + ) + df = df.sort_values("oligo_name") - # order columns to have dna then rna count of each replicate - col_order = sum( - [ - ["dna_count_" + rep, "rna_count_" + rep] - for rep in replicates - ], - [], - ) + # order columns to have dna then rna count of each replicate + col_order = sum( + [ + ["dna_count_" + rep, "rna_count_" + rep] + for rep in replicates + ], + [], + ) - df_filtered = df_filtered.reset_index() + df = df.reset_index() - df_filtered.columns = ['_'.join(col).strip() if col[1] else col[0] for col in df_filtered.columns.values] - - df_filtered = df_filtered[["Barcode", "name"] + col_order] + df.columns = ['_'.join(col).strip() if col[1] else col[0] for col in df.columns.values] + + df = df[["barcode", "oligo_name"] + col_order] - for col in col_order: - df_filtered[col] = df_filtered[col].astype('Int64') + for col in col_order: + df[col] = df[col].astype('Int32') + + return df - # write to output file - df_filtered.to_csv(output_file, sep="\t", index=False, compression="gzip") + # only keep oligo's with a number of barcodes of at least the given threshold + df_filtered = df.groupby(["oligo_name", "replicate"]).filter(lambda x: len(x) >= bc_thresh) + + # write to output file + pivot_table(df_filtered,replicates).to_csv(output_threshold_file, sep="\t", index=False, compression="gzip") + pivot_table(df,replicates).to_csv(output_file, sep="\t", index=False, compression="gzip") if 
diff --git a/workflow/scripts/count/plot_perInsertCounts_correlation.R b/workflow/scripts/count/plot_perInsertCounts_correlation.R
index 6042e2d1..9e07dd73 100644
--- a/workflow/scripts/count/plot_perInsertCounts_correlation.R
+++ b/workflow/scripts/count/plot_perInsertCounts_correlation.R
@@ -64,7 +64,7 @@ if ("label" %in% names(opt)) {
         header = TRUE,
         stringsAsFactors = FALSE
     ))
-    colnames(label_f) <- c("name", "label")
+    colnames(label_f) <- c("oligo_name", "label")
     use_labels <- TRUE
 } else {
     use_labels <- FALSE
@@ -281,7 +281,7 @@ read_data <- function(file) {
         header = TRUE,
         stringsAsFactors = FALSE
     ) %>%
-        filter(name != "no_BC") %>%
+        filter(oligo_name != "no_BC") %>%
         mutate(
             dna_normalized_log2 = log2(dna_normalized),
             rna_normalized_log2 = log2(rna_normalized),
@@ -306,7 +306,7 @@ for (n in 1:(data %>% nrow())) {
 
 if (use_labels) {
     all <- all %>%
-        left_join(label_f, by = c("name")) %>%
+        left_join(label_f, by = c("oligo_name")) %>%
         mutate(label = replace_na(label, "NA"))
 } else {
     if (nrow(all) > 0) { # can be 0 when no BCs are assigned
@@ -339,13 +339,13 @@ if (data %>% nrow() > 1 && nrow(all) > 1) {
         n_oligos_r2 <- data2 %>% nrow()
 
         n_oligos_r1_thres <- data1 %>%
-            filter(n_obs_bc >= thresh) %>%
+            filter(n_bc >= thresh) %>%
             nrow()
         n_oligos_r2_thres <- data2 %>%
-            filter(n_obs_bc >= thresh) %>%
+            filter(n_bc >= thresh) %>%
             nrow()
 
-        res <- data1 %>% inner_join(data2, by = c("name"))
+        res <- data1 %>% inner_join(data2, by = c("oligo_name"))
 
         plots_correlations_dna[[i]] <-
             plot_correlations_dna(res, cond, r1, r2, "pairwise")
@@ -369,7 +369,7 @@ if (data %>% nrow() > 1 && nrow(all) > 1) {
 
         # Min Threshold
         res <-
-            res %>% filter(n_obs_bc.x >= thresh, n_obs_bc.y >= thresh)
+            res %>% filter(n_bc.x >= thresh, n_bc.y >= thresh)
         plots_cor_min_thresh_dna[[i]] <-
             plot_correlations_dna(res, cond, r1, r2, "pairwise_minThreshold")
         plots_cor_min_thresh_rna[[i]] <-
diff --git a/workflow/scripts/count/plot_perInsertCounts_stats.R b/workflow/scripts/count/plot_perInsertCounts_stats.R
index 93f68988..00528297 100644
--- a/workflow/scripts/count/plot_perInsertCounts_stats.R
+++ b/workflow/scripts/count/plot_perInsertCounts_stats.R
@@ -65,7 +65,7 @@ if ("label" %in% names(opt)) {
         header = TRUE,
         stringsAsFactors = FALSE
     ))
-    colnames(label_f) <- c("name", "label")
+    colnames(label_f) <- c("oligo_name", "label")
     use_labels <- TRUE
 } else {
     use_labels <- FALSE
@@ -95,11 +95,10 @@ read_data <- function(file) {
         header = TRUE,
         stringsAsFactors = FALSE
     ) %>%
-        filter(name != "no_BC") %>%
+        filter(oligo_name != "no_BC") %>%
         mutate(
             dna_normalized_log2 = log2(dna_normalized),
             rna_normalized_log2 = log2(rna_normalized),
-            ratio_log2 = log2(ratio)
         )
     return(data)
 }
@@ -117,7 +116,7 @@ for (n in 1:(data %>% nrow())) {
 
 if (use_labels) {
     all <- all %>%
-        left_join(label_f, by = c("name")) %>%
+        left_join(label_f, by = c("oligo_name")) %>%
         mutate(label = replace_na(label, "NA"))
 } else {
     all$label <- "NA"
@@ -128,7 +127,7 @@ print("Histogram plots, RNA/DNA correlation plots, Violin plots")
 
 plot_median_dna_rna_cor <- function(data) {
     data <- data %>%
-        group_by(name) %>%
+        group_by(oligo_name) %>%
         summarise(dna_normalized = median(log10(dna_normalized)), rna_normalized = median(log10(rna_normalized)), n = n())
     data <- data %>% filter(n == length(replicates))
     p <- ggplot(data, aes(x = dna_normalized, y = rna_normalized)) +
@@ -151,7 +150,7 @@ plot_median_dna_rna_cor <- function(data) {
 }
 
 plot_group_bc_per_insert <- function(data) {
-    bp <- ggplot(data, aes(x = label, y = log2, fill = label)) +
+    bp <- ggplot(data, aes(x = label, y = log2FoldChange, fill = label)) +
         geom_violin() +
         geom_boxplot(width = 0.1, fill = "white") +
         xlab("insert") +
@@ -180,7 +179,7 @@ ggsave(sprintf("%s_dna_vs_rna.png", outdir),
     width = 10, height = 10
 )
 ggsave(sprintf("%s_dna_vs_rna_minThreshold.png", outdir),
-    plot_median_dna_rna_cor(all %>% filter(n_obs_bc >= thresh)),
+    plot_median_dna_rna_cor(all %>% filter(n_bc >= thresh)),
     width = 10, height = 10
 )
@@ -196,9 +195,9 @@ for (n in 1:(data %>% nrow())) {
     assigned_counts <- all %>% filter(replicate == rep)
 
     # Histograms
-    intercept <- median(assigned_counts$n_obs_bc)
+    intercept <- median(assigned_counts$n_bc)
     hist_plot_list[[n]] <-
-        ggplot(assigned_counts, aes(x = n_obs_bc)) +
+        ggplot(assigned_counts, aes(x = n_bc)) +
         geom_histogram(bins = 300) +
         geom_vline(xintercept = intercept, colour = "red") +
         xlim(0, 300) +
@@ -210,7 +209,7 @@ for (n in 1:(data %>% nrow())) {
         ggtitle(paste("replicate", rep, sep = " "))
 
     box_plot_insert_thresh_list[[n]] <-
-        plot_group_bc_per_insert(assigned_counts %>% filter(n_obs_bc >= thresh)) +
+        plot_group_bc_per_insert(assigned_counts %>% filter(n_bc >= thresh)) +
         ggtitle(paste("replicate", rep, sep = " "))
 }
diff --git a/workflow/scripts/variants/correlateVariantTables.py b/workflow/scripts/variants/correlateVariantTables.py
index a1d69069..b79e9f33 100644
--- a/workflow/scripts/variants/correlateVariantTables.py
+++ b/workflow/scripts/variants/correlateVariantTables.py
@@ -32,7 +32,7 @@ def cli(condition, variants, bc_threshold, output_file):
 
 
 def filterOnThreshold(variants, threshold):
-    return(variants.query('n_obs_bc_ALT >= %d & n_obs_bc_REF >= %d' % (threshold, threshold)))
+    return(variants.query('n_bc_ALT >= %d & n_bc_REF >= %d' % (threshold, threshold)))
 
 
 output = pd.DataFrame()
 for i in range(len(variants)):
diff --git a/workflow/scripts/variants/generateMasterVariantTable.py b/workflow/scripts/variants/generateMasterVariantTable.py
index f77c90f0..0828466c 100644
--- a/workflow/scripts/variants/generateMasterVariantTable.py
+++ b/workflow/scripts/variants/generateMasterVariantTable.py
@@ -49,18 +49,18 @@ def cli(input_files, minRNACounts, minDNACounts, output_file):
     click.echo("Create new expression values...")
     df = df.groupby(['ID', 'REF', 'ALT']).agg(dna_counts_REF=('dna_counts_REF', sum),
                                               rna_counts_REF=('rna_counts_REF', sum),
-                                              n_obs_bc_REF=('n_obs_bc_REF', sum),
+                                              n_bc_REF=('n_bc_REF', sum),
                                               dna_counts_ALT=('dna_counts_ALT', sum),
                                               rna_counts_ALT=('rna_counts_ALT', sum),
-                                              n_obs_bc_ALT=('n_obs_bc_ALT', sum)
+                                              n_bc_ALT=('n_bc_ALT', sum)
                                               ).reset_index()
 
     scaling = 10**6
 
-    df_total = df[['dna_counts_REF', 'rna_counts_REF', 'n_obs_bc_REF',
-                   'dna_counts_ALT', 'rna_counts_ALT', 'n_obs_bc_ALT']].sum()
+    df_total = df[['dna_counts_REF', 'rna_counts_REF', 'n_bc_REF',
+                   'dna_counts_ALT', 'rna_counts_ALT', 'n_bc_ALT']].sum()
 
-    total_bc = df_total['n_obs_bc_REF'] + df_total['n_obs_bc_ALT']
+    total_bc = df_total['n_bc_REF'] + df_total['n_bc_ALT']
     total_dna = df_total['dna_counts_REF'] + df_total['dna_counts_ALT'] + total_bc * pseudocountDNA
     total_rna = df_total['rna_counts_REF'] + df_total['rna_counts_ALT'] + total_bc * pseudocountRNA
 
@@ -68,8 +68,8 @@ def cli(input_files, minRNACounts, minDNACounts, output_file):
 
     def normalize(df, total, count_type, ref_alt, pseudocount, scaling):
         return(((
-            df["%s_counts_%s" % (count_type, ref_alt)] + pseudocount * df['n_obs_bc_%s' % ref_alt]
-        )/df['n_obs_bc_%s' % ref_alt])/total*scaling)
+            df["%s_counts_%s" % (count_type, ref_alt)] + pseudocount * df['n_bc_%s' % ref_alt]
+        )/df['n_bc_%s' % ref_alt])/total*scaling)
 
     for ref_alt in ["REF", "ALT"]:
         for count_type in ["dna", "rna"]:
@@ -80,20 +80,20 @@ def normalize(df, total, count_type, ref_alt, pseudocount, scaling):
                 df, total, count_type, ref_alt, pseudocount, scaling)
 
         df['ratio_%s' % ref_alt] = df['rna_normalized_%s' % ref_alt] / df['dna_normalized_%s' % ref_alt]
-        df['log2_%s' % ref_alt] = np.log2(df['ratio_%s' % ref_alt])
+        df['log2FoldChange_%s' % ref_alt] = np.log2(df['ratio_%s' % ref_alt])
 
-    df["log2_expression"] = np.log2(df['ratio_ALT']/df['ratio_REF'])
+    df["log2FoldChange_expression"] = np.log2(df['ratio_ALT']/df['ratio_REF'])
 
     # fill NA and set correct output types
     df.fillna(0, inplace=True)
-    df = df.astype(dtype={'dna_counts_REF': 'int64', 'rna_counts_REF': 'int64', 'n_obs_bc_REF': 'int64',
-                          'dna_counts_ALT': 'int64', 'rna_counts_ALT': 'int64', 'n_obs_bc_ALT': 'int64'}, copy=False)
+    df = df.astype(dtype={'dna_counts_REF': 'int64', 'rna_counts_REF': 'int64', 'n_bc_REF': 'int64',
+                          'dna_counts_ALT': 'int64', 'rna_counts_ALT': 'int64', 'n_bc_ALT': 'int64'}, copy=False)
 
     df = df.reindex(columns=["ID", "REF", "ALT", "dna_counts_REF", "rna_counts_REF",
-                             "dna_normalized_REF", "rna_normalized_REF", "ratio_REF", "log2_REF",
-                             "n_obs_bc_REF", "dna_counts_ALT", "rna_counts_ALT",
+                             "dna_normalized_REF", "rna_normalized_REF", "ratio_REF", "log2FoldChange_REF",
+                             "n_bc_REF", "dna_counts_ALT", "rna_counts_ALT",
                              "dna_normalized_ALT", "rna_normalized_ALT", "ratio_ALT",
-                             "log2_ALT", "n_obs_bc_ALT", "log2_expression"])
+                             "log2FoldChange_ALT", "n_bc_ALT", "log2FoldChange_expression"])
 
     # write output
     click.echo("Write files...")
diff --git a/workflow/scripts/variants/generateVariantTable.py b/workflow/scripts/variants/generateVariantTable.py
index fa2d719a..bf465248 100644
--- a/workflow/scripts/variants/generateVariantTable.py
+++ b/workflow/scripts/variants/generateVariantTable.py
@@ -32,18 +32,18 @@ def cli(counts_file, declaration_file, output_file):
 
     # get count df
     click.echo("Read count file...")
-    # expected column names: name dna_counts rna_counts dna_normalized rna_normalized ratio log2 n_obs_bc
+    # expected column names: oligo_name dna_counts rna_counts dna_normalized rna_normalized ratio log2FoldChange n_bc
     counts = pd.read_csv(counts_file, header=0, sep="\t", index_col=0)
 
     # join ref with index of counts
     output = declaration.join(counts, on='REF')
     # join again for the alt columns
     output = output.join(counts, on='ALT', lsuffix='_REF', rsuffix='_ALT')
 
-    output["log2_expression"] = np.log2(output['ratio_ALT']/output['ratio_REF'])
+    output["log2FoldChange_expression"] = np.log2(output['ratio_ALT']/output['ratio_REF'])
 
     # fill NA and set correct output types
     output.fillna(0, inplace=True)
-    output = output.astype(dtype={'dna_counts_REF': 'int64', 'rna_counts_REF': 'int64', 'n_obs_bc_REF': 'int64',
-                                  'dna_counts_ALT': 'int64', 'rna_counts_ALT': 'int64', 'n_obs_bc_ALT': 'int64'},
-                           copy=False)
+    output = output.astype(dtype={'dna_counts_REF': 'int64', 'rna_counts_REF': 'int64', 'n_bc_REF': 'int64',
+                                  'dna_counts_ALT': 'int64', 'rna_counts_ALT': 'int64', 'n_bc_ALT': 'int64'}, copy=False)
 
     # write output
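Illustrative note (not part of the diff): a compact sketch of the ratio_mad idea introduced in merge_label.py - bin barcodes by RNA count, then flag barcodes whose per-oligo log ratio deviates from the bin median by more than times_mad MADs. It uses pd.qcut instead of the script's explicit quantile edges, so it is an approximation of the approach, not the shipped implementation:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    n = 200
    df = pd.DataFrame({
        "oligo_name": ["oligoA"] * n,
        "dna_count": rng.integers(10, 100, n),
        "rna_count": rng.integers(10, 100, n),
    })
    df["ratio"] = np.log2(df["dna_count"] / df["rna_count"])
    df["ratio_diff"] = df["ratio"] - df.groupby("oligo_name")["ratio"].transform("median")

    # MAD of the deviations within RNA-count bins (2 bins here for brevity)
    df["bin"] = pd.qcut(np.log10(df["rna_count"]), q=2, labels=False, duplicates="drop")
    med = df.groupby("bin")["ratio_diff"].transform("median")
    mad = (df["ratio_diff"] - med).abs().groupby(df["bin"]).transform("median")
    keep = df["ratio_diff"] <= 5 * mad  # 5 = --outlier-ratio-mad-times default
    print(int((~keep).sum()), "barcodes flagged")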