diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 000000000..0146c506a
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,5 @@
+.git
+.venv
+.ruff_cache
+db
+docs
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..74d10586f
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,12 @@
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+max_line_length = 120
+
+[*.py]
+indent_size = 4
+indent_style = space
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..427ed3838
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,57 @@
+fail_fast: false
+default_language_version:
+  python: python3
+default_stages:
+  - commit
+  - push
+minimum_pre_commit_version: 2.16.0
+repos:
+  - repo: https://github.com/psf/black
+    rev: "23.11.0"
+    hooks:
+      - id: black
+  - repo: https://github.com/asottile/blacken-docs
+    rev: 1.16.0
+    hooks:
+      - id: blacken-docs
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v3.1.0
+    hooks:
+      - id: prettier
+        # Newer versions of node don't work on systems that have an older version of GLIBC
+        # (in particular Ubuntu 18.04 and CentOS 7)
+        # EOL of CentOS 7 is in 2024-06, we can probably get rid of this then.
+        # See https://github.com/scverse/cookiecutter-scverse/issues/143 and
+        # https://github.com/jupyterlab/jupyterlab/issues/12675
+        language_version: "17.9.1"
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.5
+    hooks:
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
+      - id: ruff-format
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: detect-private-key
+      - id: check-ast
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+        args: [--fix=lf]
+      - id: trailing-whitespace
+      - id: check-case-conflict
+      - id: check-added-large-files
+      - id: check-toml
+      - id: check-yaml
+      - id: check-merge-conflict
+      - id: no-commit-to-branch
+        args: ["--branch=master", "--branch=main"]
+  - repo: local
+    hooks:
+      - id: forbid-to-commit
+        name: Don't commit rej files
+        entry: |
+          Cannot commit .rej files. These indicate merge conflicts that arise during automated template updates.
+          Fix the merge conflicts manually and remove the .rej files.
+        language: fail
+        files: '.*\.rej$'
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 000000000..ed8489f93
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,16 @@
+# https://docs.readthedocs.io/en/stable/config-file/v2.html
+version: 2
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.10"
+sphinx:
+  configuration: docs/conf.py
+  # disable this for more lenient docs builds
+  fail_on_warning: false
+python:
+  install:
+    - method: pip
+      path: .
+ extra_requirements: + - doc diff --git a/README.md b/README.md index a74f4d486..8069c99cb 100644 --- a/README.md +++ b/README.md @@ -1,433 +1,36 @@ - -run_dbcan4 -======================== - -Status ----- [![dbcan](https://github.com/linnabrown/run_dbcan/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/linnabrown/run_dbcan/actions/workflows/ci.yml) [![Package status](https://anaconda.org/bioconda/dbcan/badges/version.svg)](https://anaconda.org/bioconda/dbcan) -[![GitHub license](https://img.shields.io/badge/license-GUN3.0-blue.svg)](https://github.com/linnabrown/run_dbcan/blob/master/LICENSE) +[![GitHub license](https://img.shields.io/badge/license-GNU3.0-blue.svg)](https://github.com/linnabrown/run_dbcan/blob/master/LICENSE) [![Platform](https://anaconda.org/bioconda/dbcan/badges/platforms.svg)](https://anaconda.org/bioconda/dbcan/badges/platforms.svg) [![GitHub downloads](https://anaconda.org/bioconda/dbcan/badges/downloads.svg)](https://anaconda.org/bioconda/dbcan) [![GitHub versions](https://img.shields.io/pypi/pyversions/dbcan.svg)](https://anaconda.org/bioconda/dbcan) - - -A standalone tool of [dbCAN3 web server](http://bcb.unl.edu/dbCAN2/). - -Update Info ---- -run_dbcan 4.0.0 is released. -1. CAZyme substrate prediction based on dbCAN-sub ; - -2. CGC substrate prediction based on dbCAN-PUL searching and [dbCAN-sub](https://bcb.unl.edu/dbCAN_sub/) majority voting. For CGC substrate prediction, please see our [dbCAN-seq update paper](https://academic.oup.com/nar/article/51/D1/D557/6833251?login=false) for details. With these new functions (esp. the dbCAN-sub search), run_dbcan4.0 is now slower to get the result back to you. Please be patient! - -3. See https://github.com/linnabrown/run_dbcan/issues/127 for some explanation of different output files for substrate predictions -4. **Please split your files if your input contains > 1 million proteins**. - -**Please update all of the databases**. - -[Previous update information](https://github.com/linnabrown/run_dbcan/wiki/Update-information-Archive) - -Function ----- -- Accepts user input -- Predicts genes if needed -- Runs input against HMMER, DIAMOND, and dbCAN_sub -- Optionally predicts CGCs with CGCFinder - -Support Platform ------ -Linux(Ubuntu, CentOS), MacOS - -Installation via Bioconda ------ -1. Please install [Anoconda](https://www.anaconda.com) first. - -2. Install [NCBI Blast+](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download). - -3. Create virtual environment with dbcan and activate the virtual environment. - -``` -conda create -n run_dbcan python=3.8 dbcan -c conda-forge -c bioconda -conda activate run_dbcan -``` - -If you are old user, just update the conda virtual environment `run_dbcan` via running `conda install dbcan`. - -4. Database Installation. 
-``` -test -d db || mkdir db -cd db \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/fam-substrate-mapping-08012023.tsv \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/PUL_12112023.faa && mv PUL_12112023.faa PUL.faa && makeblastdb -in PUL.faa -dbtype prot \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL_12-12-2023.xlsx \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL_12-12-2023.txt \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL.tar.gz && tar xvf dbCAN-PUL.tar.gz \ - && wget https://bcb.unl.edu/dbCAN2/download/Databases/dbCAN_sub.hmm && hmmpress dbCAN_sub.hmm \ - && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/CAZyDB.07262023.fa && diamond makedb --in CAZyDB.07262023.fa -d CAZy \ - && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/dbCAN-HMMdb-V12.txt && mv dbCAN-HMMdb-V12.txt dbCAN.txt && hmmpress dbCAN.txt \ - && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/tcdb.fa && diamond makedb --in tcdb.fa -d tcdb \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-1.hmm && hmmpress tf-1.hmm \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-2.hmm && hmmpress tf-2.hmm \ - && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/stp.hmm && hmmpress stp.hmm \ - && cd ../ && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.fna \ - && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.faa \ - && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.gff - -``` -5. (Optional) SignalP Installation. -Our program include Signalp Petitide prediction with SignalP. Make sure to set `use_signalP=True` and *have to* obtain your own academic license of SignalP and download it from [here](https://services.healthtech.dtu.dk/service.php?SignalP-4.1), and then move the perl file from the tarball file (signalp-4.1g.Linux.tar.gz) into `/usr/bin/signalp` by yourself. Following statement is singalP-4.1 installation instruction. - -Decompress signalp-4.1g.Linux.tar.gz than open the directory - -``` -tar -xvf signalp-4.1g.Linux.tar.gz && cd signalp-4.1 -``` - -Then you can find those files/directories located in `signalp-4.1` directory -``` -(base) lehuang@lehuang:~/Downloads/signalp-4.1$ ls -bin lib signalp signalp.1 signalp-4.1.readme syn test -``` - -*signalp* is the perl file that you will use in your program -Edit the paragraph labeled "GENERAL SETTINGS, CUSTOMIZE ..." in the top of - the file 'signalp'. The following twovmandatory variables need to be set: - - **SIGNALP** full path to the signalp-4.1 directory on your system - **outputDir** where to store temporary files (writable to all users) - **MAX_ALLOWED_ENTRIES** the number of input sequences allowed per run. -``` -Here is the example for me to change line 13, line 17 and line 20 in `singalp` file. 
I suggest you to set MAX_ALLOWED_ENTRIES as 100000 -############################################################################### -# GENERAL SETTINGS: CUSTOMIZE TO YOUR SITE -############################################################################### - -# full path to the signalp-4.1 directory on your system (mandatory) -BEGIN { - $ENV{SIGNALP} = '/home/lehuang/Downloads/signalp-4.1'; -} - -# determine where to store temporary files (must be writable to all users) -my $outputDir = "/home/lehuang/Downloads/signalp-4.1/output"; - -# max number of sequences per run (any number can be handled) -my $MAX_ALLOWED_ENTRIES=100000; -``` - -And then, use this command: - -``` -sudo cp signalp /usr/bin/signalp -sudo chmod 755 /usr/bin/signalp -``` -If you don't have the permission to access `/usr/bin`, you can use the parameter `-sp` or `--signalP_path` to indicate your `signalp` file path in the run_dbcan program. Please see the step 6. -6. Check Program. -``` -run_dbcan EscheriaColiK12MG1655.fna prok --out_dir output_EscheriaColiK12MG1655 -``` - -If you want to run the code with SignalP -``` -run_dbcan EscheriaColiK12MG1655.fna prok --out_dir output_EscheriaColiK12MG1655 --use_signalP=TRUE - -``` -If you don't have the permission to access `/usr/bin` when running with signalP, you can use the parameter `-sp` or `--signalP_path` to indicate your `signalp` file path in the run_dbcan program. - -``` -run_dbcan EscheriaColiK12MG1655.fna prok --out_dir output_EscheriaColiK12MG1655 --use_signalP=TRUE -sp /home/lehuang/Downloads/signalp-4.1/signalp -``` -Installation via Docker ----- -1. Make sure docker is installed on your computer successfully. -2. Docker pull image -``` -docker pull haidyi/run_dbcan:latest -``` -3. Run. Mount `input sequence file` and `output directory` to the container. -``` -docker run --name -v : -it haidyi/run_dbcan:latest {protein,meta,prok} [params] --out_dir -``` - - - -REQUIREMENTS ----- - -**TOOLS** - ----- -P.S.: You do not need to download `CGCFinder` and `hmmscan-parser` because they are included in run_dbcan V4. If you use python package or docker, you don't need to download Prodigal because they includes these denpendencies. Otherwise we recommend you to install and copy them into `/usr/bin` as system application or add their path into system envrionmental profile. - - -[Python3]--Be sure to use python3, not python2 - -[DIAMOND](https://github.com/bbuchfink/diamond)-- Included in run_dbcan4. - -[HMMER](hmmer.org)--Included in run_dbcan4. - -[hmmscan-parser](https://github.com/linnabrown/run_dbcan/blob/master/hmmscan-parser.py)--This is included in run_dbcan4. - -dbCAN_sub--Included in run_dbcan4. - -[signalp](http://www.cbs.dtu.dk/services/SignalP/)--please download and install if you need. - -[Prodigal](https://github.com/hyattpd/Prodigal)--Included in run_dbcan4. - -[CGCFinder](https://github.com/linnabrown/run_dbcan/blob/master/dbcan/utils/CGCFinder.py)--Included in run_dbcan4. - -**DATABASES Installation (those are included in step4 Database Installation)** - ----- -[Databse](http://bcb.unl.edu/dbCAN2/download/Databases) -- Database Folder - -[CAZy.fa](http://bcb.unl.edu/dbCAN2/download/Databases/CAZyDB.09242021.fa)--use `diamond makedb --in CAZyDB.09242021.fa -d CAZy` - -[dbCAN_sub](http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN_sub.hmm) --use `hmmpress dbCAN_sub.hmm` . - -[dbCAN-PUL](http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL.tar.gz) The substrates files from dbCAN-PUL. 
- -[PUL](http://bcb.unl.edu/dbCAN2/download/Databases/PUL.faa)--The PUL sequences, use `makeblastdb -in PUL.faa -dbtype prot`. -[dbCAN-HMMdb-V11.txt](http://bcb.unl.edu/dbCAN2/download/Databases/V11/dbCAN-HMMdb-V11.txt)--First use `mv dbCAN-HMMdb-V11.txt dbCAN.txt`, then use `hmmpress dbCAN.txt` - -[tcdb.fa](http://bcb.unl.edu/dbCAN2/download/Databases/tcdb.fa)--use `diamond makedb --in tcdb.fa -d tcdb` - -[tf-1.hmm](http://bcb.unl.edu/dbCAN2/download/Databases/tf-1.hmm)--use `hmmpress tf-1.hmm` - -[tf-2.hmm](http://bcb.unl.edu/dbCAN2/download/Databases/tf-2.hmm)--use `hmmpress tf-2.hmm` - -[stp.hmm](http://bcb.unl.edu/dbCAN2/download/Databases/stp.hmm)--use `hmmpress stp.hmm` - - -Params ----- -``` -Required arguments: - inputFile User input file. Must be in FASTA format. - {protein,prok,meta} Type of sequence input. protein=proteome; prok=prokaryote; meta=metagenome - -optional arguments: - -h, --help show this help message and exit - --dbCANFile DBCANFILE - Indicate the file name of HMM database such as dbCAN.txt, please use the newest one from dbCAN2 website. - --dia_eval DIA_EVAL DIAMOND E Value - --dia_cpu DIA_CPU Number of CPU cores that DIAMOND is allowed to use - --hmm_eval HMM_EVAL HMMER E Value - --hmm_cov HMM_COV HMMER Coverage val - --hmm_cpu HMM_CPU Number of CPU cores that HMMER is allowed to use - --out_pre OUT_PRE Output files prefix - --out_dir OUT_DIR Output directory - --db_dir DB_DIR Database directory - --tools {hmmer,diamond,dbcansub,all} [{hmmer,diamond,dbcansub,all} ...], -t {hmmer,diamond,dbcansub,all} [{hmmer,diamond,dbcansub,all} ...] - Choose a combination of tools to run - --use_signalP USE_SIGNALP - Use signalP or not, remember, you need to setup signalP tool first. Because of signalP license, Docker version does not have signalP. - --signalP_path SIGNALP_PATH, -sp SIGNALP_PATH - The path for signalp. Default location is signalp - --gram {p,n,all}, -g {p,n,all} - Choose gram+(p) or gram-(n) for proteome/prokaryote nucleotide, which are params of SingalP, only if user use singalP - -v VERSION, --version VERSION - -dbCAN-sub parameters: - --dbcan_thread DBCAN_THREAD, -dt DBCAN_THREAD - --tf_eval TF_EVAL tf.hmm HMMER E Value - --tf_cov TF_COV tf.hmm HMMER Coverage val - --tf_cpu TF_CPU tf.hmm Number of CPU cores that HMMER is allowed to use - --stp_eval STP_EVAL stp.hmm HMMER E Value - --stp_cov STP_COV stp.hmm HMMER Coverage val - --stp_cpu STP_CPU stp.hmm Number of CPU cores that HMMER is allowed to use - -CGC_Finder parameters: - --cluster CLUSTER, -c CLUSTER - Predict CGCs via CGCFinder. This argument requires an auxillary locations file if a protein input is being used - --cgc_dis CGC_DIS CGCFinder Distance value - --cgc_sig_genes {tf,tp,stp,tp+tf,tp+stp,tf+stp,all} - CGCFinder Signature Genes value - -CGC_Substrate parameters: - --cgc_substrate run cgc substrate prediction? - --pul PUL dbCAN-PUL PUL.faa - -o OUT, --out OUT - -w WORKDIR, --workdir WORKDIR - -env ENV, --env ENV - -oecami, --oecami out eCAMI prediction intermediate result? - -odbcanpul, --odbcanpul - output dbCAN-PUL prediction intermediate result? 
- -dbCAN-PUL homologous searching parameters: - how to define homologous gene hits and PUL hits - - -upghn UNIQ_PUL_GENE_HIT_NUM, --uniq_pul_gene_hit_num UNIQ_PUL_GENE_HIT_NUM - -uqcgn UNIQ_QUERY_CGC_GENE_NUM, --uniq_query_cgc_gene_num UNIQ_QUERY_CGC_GENE_NUM - -cpn CAZYME_PAIR_NUM, --CAZyme_pair_num CAZYME_PAIR_NUM - -tpn TOTAL_PAIR_NUM, --total_pair_num TOTAL_PAIR_NUM - -ept EXTRA_PAIR_TYPE, --extra_pair_type EXTRA_PAIR_TYPE - None[TC-TC,STP-STP]. Some like sigunature hits - -eptn EXTRA_PAIR_TYPE_NUM, --extra_pair_type_num EXTRA_PAIR_TYPE_NUM - specify signature pair cutoff.1,2 - -iden IDENTITY_CUTOFF, --identity_cutoff IDENTITY_CUTOFF - identity to identify a homologous hit - -cov COVERAGE_CUTOFF, --coverage_cutoff COVERAGE_CUTOFF - query coverage cutoff to identify a homologous hit - -bsc BITSCORE_CUTOFF, --bitscore_cutoff BITSCORE_CUTOFF - bitscore cutoff to identify a homologous hit - -evalue EVALUE_CUTOFF, --evalue_cutoff EVALUE_CUTOFF - evalue cutoff to identify a homologous hit - -dbCAN-sub major voting parameters: - how to define dbsub hits and dbCAN-sub subfamily substrate - - -hmmcov HMMCOV, --hmmcov HMMCOV - -hmmevalue HMMEVALUE, --hmmevalue HMMEVALUE - -ndsc NUM_OF_DOMAINS_SUBSTRATE_CUTOFF, --num_of_domains_substrate_cutoff NUM_OF_DOMAINS_SUBSTRATE_CUTOFF - define how many domains share substrates in a CGC, one protein may include several subfamily domains. - -npsc NUM_OF_PROTEIN_SUBSTRATE_CUTOFF, --num_of_protein_substrate_cutoff NUM_OF_PROTEIN_SUBSTRATE_CUTOFF - define how many sequences share substrates in a CGC, one protein may include several subfamily domains. - -subs SUBSTRATE_SCORS, --substrate_scors SUBSTRATE_SCORS - each cgc contains with substrate must more than this value -``` - -RUN & OUTPUT ----- -Use following command to run the program. -``` -run_dbcan [inputFile] [inputType] [-c AuxillaryFile] [-t Tools] etc. -``` - -Several files will be produced via `run_dbcan`. They are as follows: - - uniInput - The unified input file for the rest of the tools - (created by prodigal if a nucleotide sequence was used) - - dbsub.out - the output from the dbCAN_sub run - - diamond.out - the output from the diamond blast - - hmmer.out - the output from the hmmer run - - tf.out - the output from the diamond blast predicting TF's for CGCFinder - - tc.out - the output from the diamond blast predicting TC's for CGCFinder - - cgc.gff - GFF input file for CGCFinder - - cgc.out - ouput from the CGCFinder run - - cgc_standard.out - simplified version of cgc.out, which contains columns: - CGC_id (CAZyme Gene Cluster ID), - type (CAZyme, TC, null, etc.), - contig_id, - gene_id, - start, - end, - strand, - annotation - - overview.txt - Details the CAZyme predictions across the three tools with signalp results - - see https://github.com/linnabrown/run_dbcan/issues/127 for some explanation of different output files for substrate predictions - -EXAMPLE ----- - -An example setup is available in the example directory. Included in this directory are two FASTA sequences (one protein, one nucleotide). 
-
-To run this example type, run:
-
-```
-run_dbcan EscheriaColiK12MG1655.fna prok --out_dir output_EscheriaColiK12MG1655
-```
-
-or
-
-```
-run_dbcan EscheriaColiK12MG1655.faa protein --out_dir output_EscheriaColiK12MG1655
-```
-
-To run the examples with CGCFinder turned on, run:
-```
-run_dbcan EscheriaColiK12MG1655.fna prok -c cluster --out_dir output_EscheriaColiK12MG1655
-```
-
-or
-
-```
-run_dbcan EscheriaColiK12MG1655.faa protein -c EscheriaColiK12MG1655.gff --out_dir output_EscheriaColiK12MG1655
-```
-
-Notice that the protein command has a GFF file following the -c option. A GFF or BED format file with gene position information is required to run CGCFinder when using a protein input.
-
-If you have any questions, please feel free to contact with Dr. Yin (yanbin.yin@gmail.com or yyin@unl.edu) or me (Le Huang) on [Issue Dashboard](https://github.com/linnabrown/run_dbcan/issues).
-
-
-Reference
-----
-
-This is the standalone version of dbCAN annotation tool for automated CAZyme annotation (known as run_dbCAN), written by Le Huang and Tanner Yohe.
-
-If you want to use our dbCAN3 webserver, please go to http://bcb.unl.edu/dbCAN2/. Please cite us:
-*Jinfang Zheng, Qiwei Ge, Yuchen Yan, Xinpeng Zhang, Le Huang, Yanbin Yin, dbCAN3: automated carbohydrate-active enzyme and substrate annotation, Nucleic Acids Research, 2023;, gkad328, https://doi.org/10.1093/nar/gkad328*
-```
-@article{10.1093/nar/gkad328,
-    author = {Zheng, Jinfang and Ge, Qiwei and Yan, Yuchen and Zhang, Xinpeng and Huang, Le and Yin, Yanbin},
-    title = "{dbCAN3: automated carbohydrate-active enzyme and substrate annotation}",
-    journal = {Nucleic Acids Research},
-    year = {2023},
-    month = {05},
-    issn = {0305-1048},
-    doi = {10.1093/nar/gkad328},
-    url = {https://doi.org/10.1093/nar/gkad328},
-    note = {gkad328},
-    eprint = {https://academic.oup.com/nar/advance-article-pdf/doi/10.1093/nar/gkad328/50150154/gkad328.pdf},
-}
-
-```
-
-
-If you use dbCAN standalone tool (run_dbcan) or/and our web server for publication, please cite us:
+# run_dbcan - Standalone Tool of [dbCAN3](http://bcb.unl.edu/dbCAN2/)

-*Han Zhang, Tanner Yohe, **Le Huang**, Sarah Entwistle, Peizhi Wu, Zhenglu Yang, Peter K Busk, Ying Xu, Yanbin Yin;
-dbCAN2: a meta server for automated carbohydrate-active enzyme annotation, Nucleic Acids Research,
-Volume 46, Issue W1, 2 July 2018, Pages W95–W101, https://doi.org/10.1093/nar/gky418*
-```
-@article{doi:10.1093/nar/gky418,
-author = {Zhang, Han and Yohe, Tanner and Huang, Le and Entwistle, Sarah and Wu, Peizhi and Yang, Zhenglu and Busk, Peter K and Xu, Ying and Yin, Yanbin},
-title = {dbCAN2: a meta server for automated carbohydrate-active enzyme annotation},
-journal = {Nucleic Acids Research},
-volume = {46},
-number = {W1},
-pages = {W95-W101},
-year = {2018},
-doi = {10.1093/nar/gky418},
-URL = {http://dx.doi.org/10.1093/nar/gky418},
-eprint = {/oup/backfile/content_public/journal/nar/46/w1/10.1093_nar_gky418/1/gky418.pdf}
-}
-```
+**run_dbcan** is the standalone version of the [dbCAN3](http://bcb.unl.edu/dbCAN2/) annotation tool for automated CAZyme annotation. This tool, known as `run_dbcan`, incorporates HMMER, DIAMOND, and dbCAN_sub for annotating CAZyme families, and integrates CAZyme Gene Cluster (CGC) and substrate predictions.

-If you want to use pre-computed bacterial CAZyme sequences/annotations directly, please go to http://bcb.unl.edu/dbCAN_seq/ and cite us:
+For usage discussions, visit our [issue tracker](https://github.com/linnabrown/run_dbcan/issues). To learn more, read the [dbcan doc]. If you're interested in contributing, whether through issues or pull requests, please review our contribution guide.
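+## Quick Start
+
+A minimal example run, assuming a conda setup and the E. coli K-12 sample files; the commands mirror the bioconda install and sample run from the previous README (see the [dbcan doc] for full installation and database download steps):
+
+```bash
+# assumes the databases have already been downloaded into ./db (see the dbcan doc)
+conda create -n run_dbcan python=3.8 dbcan -c conda-forge -c bioconda
+conda activate run_dbcan
+run_dbcan EscheriaColiK12MG1655.fna prok --out_dir output_EscheriaColiK12MG1655
+```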
-**Le Huang**, Han Zhang, Peizhi Wu, Sarah Entwistle, Xueqiong Li, Tanner Yohe, Haidong Yi, Zhenglu Yang, Yanbin Yin;
-dbCAN-seq: a database of carbohydrate-active enzyme (CAZyme) sequence and annotation, Nucleic Acids Research,
-Volume 46, Issue D1, 4 January 2018, Pages D516–D521, https://doi.org/10.1093/nar/gkx894*
-```
-@article{doi:10.1093/nar/gkx894,
-author = {Huang, Le and Zhang, Han and Wu, Peizhi and Entwistle, Sarah and Li, Xueqiong and Yohe, Tanner and Yi, Haidong and Yang, Zhenglu and Yin, Yanbin},
-title = {dbCAN-seq: a database of carbohydrate-active enzyme (CAZyme) sequence and annotation},
-journal = {Nucleic Acids Research},
-volume = {46},
-number = {D1},
-pages = {D516-D521},
-year = {2018},
-doi = {10.1093/nar/gkx894},
-URL = {http://dx.doi.org/10.1093/nar/gkx894},
-eprint = {/oup/backfile/content_public/journal/nar/46/d1/10.1093_nar_gkx894/2/gkx894.pdf}
-}
-```
+## Reference

+Please cite the following `dbCAN` publications if you use `run_dbcan` in your research:

-## Commit History
+> **dbCAN3: automated carbohydrate-active enzyme and substrate annotation**
+>
+> Jinfang Zheng, Qiwei Ge, Yuchen Yan, Xinpeng Zhang, Le Huang, Yanbin Yin
+>
+> Nucleic Acids Research, 2023, gkad328, doi: [10.1093/nar/gkad328](https://doi.org/10.1093/nar/gkad328).

-[![Commit History Chart](https://commit-history-api.herokuapp.com/svg?repos=linnabrown/run_dbcan&type=Date)](https://the-commit-history.vercel.app/#linnabrown/run_dbcan&Date)
+> **dbCAN2: a meta server for automated carbohydrate-active enzyme annotation**
+>
+> Han Zhang, Tanner Yohe, Le Huang, Sarah Entwistle, Peizhi Wu, Zhenglu Yang, Peter K Busk, Ying Xu, Yanbin Yin
+>
+> Nucleic Acids Research, Volume 46, Issue W1, 2 July 2018, Pages W95–W101, doi: [10.1093/nar/gky418](https://doi.org/10.1093/nar/gky418).

+> **dbCAN-seq: a database of carbohydrate-active enzyme (CAZyme) sequence and annotation**
+>
+> Le Huang, Han Zhang, Peizhi Wu, Sarah Entwistle, Xueqiong Li, Tanner Yohe, Haidong Yi, Zhenglu Yang, Yanbin Yin
+>
+> Nucleic Acids Research, Volume 46, Issue D1, 4 January 2018, Pages D516–D521, doi: [10.1093/nar/gkx894](https://doi.org/10.1093/nar/gkx894).

+[dbcan doc]: https://dbcan.readthedocs.io
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index ad903c322..64db58219 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -42,6 +42,8 @@ requirements:
     - numpy >1.19
     - biopython
     - pandas
+    - openpyxl
+    - matplotlib

 test:
   imports:
diff --git a/dbcan_cli/__init__.py b/dbcan/cli/__init__.py
similarity index 100%
rename from dbcan_cli/__init__.py
rename to dbcan/cli/__init__.py
diff --git a/dbcan/cli/cgc_process_json.py b/dbcan/cli/cgc_process_json.py
new file mode 100644
index 000000000..0c2536ab2
--- /dev/null
+++ b/dbcan/cli/cgc_process_json.py
@@ -0,0 +1,276 @@
+##########################
+# to generate json file for all cgc_standard.out files from run_dbcan
+# use: python cgc_process_json.py -i cgc_standard.out -o cgc_standard.out.json
+# written by Roland Madadjim in Cui's lab at Soc, UNL
+# last updated: 12/09/2022
+##########################
+
+# from __future__ import print_function
+import argparse
+import json
+import os
+
+import numpy as np
+import pandas as pd
+
+
+class PrePro:
+    """
+    A class for preprocessing and organizing genetic data.
+
+    Attributes
+    ----------
+    df (DataFrame): The initial data provided to the class.
+ + Args: + data (DataFrame): The data to be processed. + + """ + + def __init__(self, data): + self.df = data + + def extract_gs(self, dataList): + """ + Extracts gene strings from the provided data list. + + This method processes each entry in the dataList, extracting and concatenating specific gene-related information based on gene types like CAZyme, TC, TF, STP, and Null_Type. + + Args: + dataList (list): A list of dictionaries, each containing gene and protein information. + + Returns + ------- + str: A string representing concatenated gene information. + """ + i = 0 + geneL = [] + # gene = list(map(lambda e: "{}".format(e["Gene_Type"]), dataList)) + # pfam = list(map(lambda e: "{}".format(e["Protein_Family"]), dataList)) + gene = ["{}".format(e["Gene_Type"]) for e in dataList] + pfam = ["{}".format(e["Protein_Family"]) for e in dataList] + while i < len(dataList): + if gene[i] == "CAZyme": + s = pfam[i] + geneL.append(s) + elif gene[i] == "TC": + s = pfam[i] + geneL.append(s) + elif gene[i] == "TF": + s = pfam[i] + geneL.append(s) + elif gene[i] == "STP": + s = pfam[i] + geneL.append(s) + elif gene[i] == "Null_Type": + s = "NA" + geneL.append(s) + i = i + 1 + gene_st = "-".join(geneL) + return gene_st + + def pul_section(self): + """ + Processes the dataframe and yields structured genetic data. + + This method groups the dataframe by 'cgc_id' and processes each group to yield a dictionary containing detailed genetic information for each 'cgc_id'. + + Yields + ------ + dict: A dictionary containing genetic and protein information structured by 'cgc_id'. + """ + for (cgc_id), df_pul_grouped in self.df.groupby("cgc_id"): + datalist = list(self.cluster_section(df_pul_grouped)) + gene_str = self.extract_gs(datalist) + yield { + cgc_id: { + "changelog": [], + "Cluster_ID": cgc_id, + "Gene_String": gene_str, + "Contig_ID": self.df.loc[self.df["cgc_id"] == cgc_id, "contig_id"].iloc[0], + "ncbi_species_tax_id": [], + "organism_name": [], + "publication": [], + "Protein": list(self.cluster_section(df_pul_grouped)), + # "dbCan_Pul_accession": ID, # as string or integer? + # "publication": df.loc[df['ID'] == ID, 'PMID'].iloc[0], + } + } + + def cluster_section(self, df_pul_grouped): + """ + Yields structured data for each gene cluster. + + This method iterates over grouped dataframe and yields a dictionary containing detailed information about each gene cluster. + + Args: + df_pul_grouped (DataFrame): The grouped dataframe by specific gene identifiers. + + Yields + ------ + dict: A dictionary containing information about a gene cluster. + """ + for ( + contig_id, + gene_type, + protein_id, + gene_start, + gene_stop, + direction, + protein_family, + ), _ in df_pul_grouped.groupby( + [ + "contig_id", + "gene_type", + "protein_id", + "gene_start", + "gene_stop", + "direction", + "protein_family", + ] + ): + yield { + "contig_id": contig_id, + "protein_id": protein_id, + "Gene_Type": gene_type, + "Gene_Start": gene_start, + "Gene_Stop": gene_stop, + "Strand": direction, + "Protein_Family": protein_family, + } + + def run_dbCan_section(self, df_puls): + """ + Processes the dataframe and yields gene and protein information. + + This method iterates over each row of the dataframe and yields a dictionary containing gene and protein information. + Currently, this method is a stub and needs to be implemented. + + Args: + df_puls (DataFrame): The dataframe containing gene and protein data. + + Yields + ------ + dict: A dictionary containing gene and protein information. 
+ """ + for _ in df_puls.itertuples(): + yield { + # "Gene_Type": row.gene_type, + # "Gene_Start": row.gene_start, + # "Gene_Stop": row.gene_stop, + # "Strand": row.direction, + # "Protein_Family": row.protein_family + } + + +def file_ext(choices, fname, parser): + """ + Validates the extension of a given file name against allowed choices. + + This function checks if the file extension of `fname` is among the specified `choices`. If not, it raises an error through the provided `parser`. + + Parameters + ---------- + choices (tuple): A tuple of allowed file extensions (without the dot). + fname (str): The file name to be checked. + parser (argparse.ArgumentParser): The argument parser to use for raising an error if the extension is not allowed. + + Returns + ------- + str: The validated file name if its extension is in `choices`. + + Raises + ------ + argparse.ArgumentError: If the file extension is not in the allowed `choices`. + """ + ext = os.path.splitext(fname)[1][1:] + if ext not in choices: + parser.error("File needs to be a .out or .csv") + return fname + + +class CustomEncoder(json.JSONEncoder): + """ + Custom JSON encoder for handling numpy data types. + + This encoder extends `json.JSONEncoder` to provide custom serialization for certain data types not natively supported by the default JSON encoder. Specifically, it converts `numpy.int64` types to strings. + + Methods + ------- + default(obj): Overrides the default method to provide custom serialization. + """ + + def default(self, obj): + """ + Provides custom serialization for certain data types. + + Parameters + ---------- + obj: The object to serialize. + + Returns + ------- + The serialized object, converting `numpy.int64` to string. For other types, it relies on the superclass implementation. + + Raises + ------ + TypeError: If the object is not a recognized type and cannot be serialized by the superclass. + """ + if isinstance(obj, np.int64): + return str(obj) + return super().default(obj) + + +def main(): + """ + Main function to compile JSON from a 'cgc_standard.out' file. + + This function parses command-line arguments to get input and output file paths, reads the input file, processes and renames columns, and writes the processed data to a JSON file using a custom JSON encoder. + + Side Effects: + - Reads a specified input file. + - Writes processed data to a JSON output file. + - Can terminate the script if the input file does not have the correct extension. 
+ """ + parser = argparse.ArgumentParser(description="Compiling Json from cgc_standard.out") + parser.add_argument( + "-i", + required=True, + help="path to output file (cgc_standard.out) file", + type=lambda s: file_ext(("out", "csv"), s, parser), + ) + parser.add_argument("-o", "--output") + args = parser.parse_args() + + with open(args.i) as file: ### input files + data = pd.read_csv(file, sep="\t") + data.rename( + columns={ + "CGC#": "cgc_id", + "Gene Type": "gene_type", + "Contig ID": "contig_id", + "Protein ID": "protein_id", + "Gene Start": "gene_start", + "Gene Stop": "gene_stop", + "Direction": "direction", + "Protein Family": "protein_family", + }, + inplace=True, + ) + data["gene_type"].fillna("Null_Type", inplace=True) + data["protein_family"].fillna("0", inplace=True) + p = PrePro(data) + + pul_list = list(p.pul_section()) + pul_dict = {} + for sub_dict in pul_list: + pul_dict.update(sub_dict) + jsonPuls = json.dumps(pul_dict, indent=4, cls=CustomEncoder) + + with open(args.output, "w") as outfile: + # with open("Json"+time.strftime("%Y%m%d%H%M%S")+".json","w") as outfile: + outfile.write(jsonPuls) + + +if __name__ == "__main__": + main() diff --git a/dbcan/cli/hmmer_parser.py b/dbcan/cli/hmmer_parser.py new file mode 100644 index 000000000..c9d853ad7 --- /dev/null +++ b/dbcan/cli/hmmer_parser.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +########################################################## +# hmmsearch parser for dbCAN meta server +# +# Written by Tanner Yohe under the supervision +# of Dr. Yin in the YinLab at NIU. +# Updated by Haidong Yi to reformat the codes. +# Updated by Le Huang from tips the contributor WATSON Mick , +# Thank you! +# +# Modified by Alex Fraser to have a run() method that can be called and returns data for better integration with other +# scripts. This script also retains the ability to be called from shell and output to pipe redirection. +# +# INPUT +# python hmmer_parser.py [inputFile] [eval] [coverage] +# eval and coverage are optional, inputFile is required +# -updating info: +# -adds pid for every subprocess to make codes robust. +# Last updated: 1/10/19 +########################################################### + +import os +import sys +from subprocess import call + + +def run(input_file, eval_num=1e-15, coverage=0.35, verbose=False): + """ + Processes a given input file and returns filtered genomic data. + + This function reads an input file, performs various filtering and processing steps, and returns a string containing the processed data. It uses shell commands for data manipulation and filtering based on specified evaluation number and coverage values. + + Parameters + ---------- + input_file (str): The path to the input file containing genomic data. + eval_num (float, optional): The evaluation number threshold for filtering. Defaults to 1e-15. + coverage (float, optional): The coverage threshold for filtering. Defaults to 0.35. + verbose (bool, optional): If True, the function will print detailed processing information. Defaults to False. + + Returns + ------- + str: A string containing the processed data, where each line represents a row of filtered data based on the evaluation number and coverage criteria. + + Side Effects: + - Creates and deletes a temporary file named 'temp.'. + - Prints processing details to the standard output if verbose is True. + + Note: + This function relies on external shell commands and Perl script for data processing, and it assumes a specific format for the input data. + """ + tmpfile = "temp." 
+
+    call(
+        "cat "
+        + input_file
+        + " | grep -v '^#' | awk '{print $4,$6,$1,$3,$13,$16,$17,$18,$19}' | sed 's/ /\t/g' | sort -k 3,3 -k 8n -k 9n | perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[2]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[4]<$c[4]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_.\"\n\";}}' > "
        + tmpfile,
+        shell=True,
+    )
+
+    output = ""
+    with open(tmpfile) as f:
+        for line in f:
+            row = line.rstrip().split("\t")
+            row.append(float(int(row[6]) - int(row[5])) / int(row[1]))
+            if float(row[4]) <= float(eval_num) and float(row[-1]) >= float(coverage):
+                if verbose:
+                    print("\t".join([str(x) for x in row]))
+                output += "\t".join([str(x) for x in row]) + "\n"
+    call(["rm", tmpfile])
+
+    return output
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 3:
+        file = sys.argv[1]
+        eval_arg = float(sys.argv[2])
+        coverage_arg = float(sys.argv[3])
+        run(file, eval_arg, coverage_arg, verbose=True)
+    elif len(sys.argv) > 1:
+        file = sys.argv[1]
+        run(file, verbose=True)
+    else:
+        print("Please give a hmmsearch output file as the first command line argument")
+        exit()
diff --git a/dbcan/cli/hmmscan_parser.py b/dbcan/cli/hmmscan_parser.py
new file mode 100644
index 000000000..d6aa3365b
--- /dev/null
+++ b/dbcan/cli/hmmscan_parser.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+##########################################################
+# hmmscan parser for dbCAN meta server
+#
+# Based off the hmmscan parser used in the dbCAN server,
+# written by Dr. Yin
+#
+# Written by Tanner Yohe under the supervision
+# of Dr. Yin in the YinLab at NIU.
+#
+# Updated by Le Huang based on tips from the contributor WATSON Mick.
+# Thank you!
+#
+# Modified by Alex Fraser to have a run() method that can be called and returns data for better integration with other
+# scripts. This script also retains the ability to be called from shell and output to pipe redirection.
+# This file had to be renamed from "hmmscan-parser.py" to "hmmscan_parser.py" because of python module import conventions.
+# Modified on 07/06/22
+#
+# INPUT
+# python hmmscan-parser-dbCANmeta.py [inputFile] [eval] [coverage]
+# eval and coverage are optional, inputFile is required
+# -updating info:
+# -adds pid for every subprocess to make codes robust.
+# Last updated: 1/10/19
+###########################################################
+
+import os
+import sys
+from subprocess import call
+
+
+def run(input_file, eval_num=1e-15, coverage=0.35, verbose=False):
+    """
+    Executes a genomic data processing pipeline on a given input file.
+
+    This function processes an input genomic file, applying various transformations and filtering criteria based on evaluation number and coverage. It first executes a series of shell commands to manipulate the data, then reads the processed data, applies additional filtering, and returns the result as a string.
+
+    Parameters
+    ----------
+    input_file (str): The path to the input file containing genomic data.
+    eval_num (float, optional): Evaluation number threshold for data filtering, defaults to 1e-15.
+    coverage (float, optional): Coverage threshold for data filtering, defaults to 0.35.
+    verbose (bool, optional): If True, outputs additional processing information, defaults to False.
+
+    Returns
+    -------
+    str: A string containing processed and filtered genomic data. Each line in the string represents a row of data that meets the specified evaluation number and coverage thresholds.
+
+    Side Effects:
+    - Creates a temporary file named 'temp.<pid>' for intermediate processing.
+    - Deletes the temporary file after processing.
+    - Outputs processing details to the standard output if verbose is True.
+
+    Notes
+    -----
+    - The function relies on external shell commands and Perl for initial data processing.
+    - Assumes a specific format for the input data, including expected columns and data types.
+    """
+    tmpfile = "temp." + str(os.getpid())
+
+    call(
+        "cat "
+        + input_file
+        + " | grep -v '^#' | awk '{print $1,$3,$4,$6,$13,$16,$17,$18,$19}' | sed 's/ /\t/g' | sort -k 3,3 -k 8n -k 9n | perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[2]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[4]<$c[4]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_.\"\n\";}}' > "
+        + tmpfile,
+        shell=True,
+    )
+
+    output = ""
+    with open(tmpfile) as f:
+        for line in f:
+            row = line.rstrip().split("\t")
+            row.append(float(int(row[6]) - int(row[5])) / int(row[1]))
+            if float(row[4]) <= float(eval_num) and float(row[-1]) >= float(coverage):
+                if verbose:
+                    print("\t".join([str(x) for x in row]))
+                output += "\t".join([str(x) for x in row]) + "\n"
+    call(["rm", tmpfile])
+
+    return output
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 3:
+        file = sys.argv[1]
+        eval_arg = float(sys.argv[2])
+        coverage_arg = float(sys.argv[3])
+        run(file, eval_arg, coverage_arg, verbose=True)
+    elif len(sys.argv) > 1:
+        file = sys.argv[1]
+        run(file, verbose=True)
+    else:
+        print("Please give a hmmscan output file as the first command line argument")
+        exit()
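As both module headers note, run() can also be imported and called from other scripts (run_dbcan.py itself does `from dbcan.cli import hmmer_parser`). A minimal sketch of that usage; the `h.out` filename is illustrative, and the thresholds shown are just the module defaults:

```python
from dbcan.cli import hmmer_parser

# Filter a hmmsearch --domtblout result (h.out is an illustrative name),
# keeping hits with E-value <= 1e-15 and coverage >= 0.35.
filtered = hmmer_parser.run("h.out", eval_num=1e-15, coverage=0.35)
for line in filtered.splitlines():
    print(line.split("\t")[0])  # HMM profile name of each retained hit
```

`hmmscan_parser.run` has the same signature and return format; the two modules differ only in the column order they expect from hmmsearch versus hmmscan output.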
diff --git a/dbcan/cli/run_dbcan.py b/dbcan/cli/run_dbcan.py
new file mode 100755
index 000000000..224c9c187
--- /dev/null
+++ b/dbcan/cli/run_dbcan.py
@@ -0,0 +1,1204 @@
+#!/usr/bin/env python3
+#########################################################
+# dbCAN3 (Stand Alone Version)
+#
+# Written by Tanner Yohe in the Yin Lab at NIU
+# Revised by Qiwei Ge in Yin Lab at UNL && Le Huang at NKU
+# Updated by Le Huang at UNC, Mohamad Majd Raslan in the Yin Lab at NIU, Wei Li, Qiwei Ge in Dr.Yin's Lab at UNL
+# Updated by Haidong Yi for reconstructing codes, Alex Fraser for adding functions.
+# Updated by Jinfang Zheng in Yinlab at UNL, new function, substrate prediction based on dbCAN-PUL and dbCAN-sub database.

+import argparse
+import os
+import time

+# Recent updated information:
+# Jan/01/23: Add doc code
+# Oct/10/23: Reconstructed the run_dbcan [Haidong Yi]
+# Sep/07/23: Replace hmmscan with hmmsearch. Update perl code [Le Huang, Yanbin Yin]
+# Dec/15/22: 1. adding function to convert cgc_standard.out to json format. 2. adding function cgc_[Jinfang Zheng]
+# Dec/06/22: fix gene ID in CGCfinder output file cgc.out [Jinfang Zheng]
+# Nov/06/22: Using dbCAN_sub, eCAMI has been removed [Qiwei Ge]
+# Jun/13/22: Allowing direct calls to main function from other scripts [Alex Fraser]
+# Sep/29/22: 1. Hotpep has been removed, added eCAMI tool. 2. cgc out reformatting. 3. Fixed multiple GT2s [Qiwei Ge]
+#
+# Accepts user input
+# Predicts genes if needed
+# Runs input against HMMER, DIAMOND, and dbCAN_sub
+# Optionally predicts CGCs with CGCFinder
+# Creates an overview table using output files from core
+# tools from dbcan-sub.hmm.out, hmmer.out and diamond.out if they exist.
+##########################################################
+from subprocess import Popen, call
+
+from dbcan.cli import hmmer_parser
+from dbcan.utils.cgc_substrate_prediction import cgc_substrate_prediction
+from dbcan.utils.CGCFinder import cgc_finder, simplify_cgc_output
+
+
+def runHmmer(outPath, hmm_cpu, dbDir, hmm_eval, hmm_cov, db_name):
+    """
+    Executes HMMER's hmmsearch tool to search sequence databases for sequence homologs.
+
+    This function runs the hmmsearch command from the HMMER suite, directing the output to specified files. It then parses the hmmsearch output, writes the parsed output to a file, and cleans up intermediate files.
+
+    Parameters
+    ----------
+    outPath (str): The path where output files will be saved.
+    hmm_cpu (int): The number of CPUs to use for hmmsearch.
+    dbDir (str): The directory containing the HMM database files.
+    hmm_eval (float): The E-value threshold for considering hits as significant in hmmsearch.
+    hmm_cov (float): The coverage threshold for considering hits as significant.
+    db_name (str): The name of the database to use for hmmsearch.
+
+    Side Effects:
+    - Creates and writes to files in the specified output path.
+    - Deletes intermediate output files after processing is complete.
+    """
+    domtblout_file = f"{outPath}h{db_name}.out"
+    hmm_file = f"{dbDir}{db_name}.hmm"
+    uniInput_file = f"{outPath}uniInput"
+
+    hmmer = Popen(
+        [
+            "hmmsearch",
+            "--domtblout",
+            domtblout_file,
+            "--cpu",
+            str(hmm_cpu),
+            "-o",
+            "/dev/null",
+            hmm_file,
+            uniInput_file,
+        ]
+    )
+    hmmer.wait()
+    parsed_hmm_output = hmmer_parser.run(input_file=f"{outPath}h{db_name}.out", eval_num=hmm_eval, coverage=hmm_cov)
+    with open(f"{outPath}{db_name}.out", "w") as f:
+        f.write(parsed_hmm_output)
+    if os.path.exists(domtblout_file):
+        call(["rm", domtblout_file])
+
+
+def split_uniInput(uniInput, dbcan_thread, outPath, dbDir, hmm_eval, hmm_cov, hmm_cpu, dbcan_offset):
+    """
+    Splits a large input file into smaller files and runs HMMER hmmsearch on each part.
+
+    This function is designed to handle large genomic data files. It splits the input file into smaller parts based on the specified offset and then runs hmmsearch on each part in parallel. The function manages file creation and deletion, ensuring that temporary files are removed after processing.
+
+    Parameters
+    ----------
+    uniInput (str): Path to the input file containing genomic data.
+    dbcan_thread (int): Number of threads to use for parallel processing.
+    outPath (str): Path where output files and temporary files will be stored.
+    dbDir (str): Directory containing the HMM database files.
+    hmm_eval (float): E-value threshold for considering hits as significant in hmmsearch.
+    hmm_cov (float): Coverage threshold for considering hits as significant.
+    hmm_cpu (int): Number of CPU cores for each hmmsearch job.
+    dbcan_offset (float): Factor to determine the size of each split file based on the input file size.
+
+    Side Effects:
+    - Creates multiple temporary files during processing.
+    - Prints the count of identifiers and processing time to the standard output.
+    - Deletes all temporary files created during the process.
+
+    Notes
+    -----
+    - The function assumes a specific format for the input data.
+ - Time taken for execution is printed at the end of the function's run. + """ + ticks = time.time() + file = open(uniInput) + uniInput_file = file.readlines() + file.close() + signal_count = 0 + min_files = dbcan_thread + file_number = None + split_files = [] + fsize = int(os.path.getsize(uniInput) / float(1024 * 1024) * dbcan_offset) + + if fsize < 1: + fsize = 1 + + for line in uniInput_file: + if ">" in line: + signal_count += 1 + print(f"Count of proteins: {signal_count}") + + if signal_count >= min_files: + for i in range(fsize): + with open(f"{outPath}{i}.txt", "w") as f: + pass + split_files.append(f"{i}.txt") + for i in range(len(uniInput_file)): + if ">" in uniInput_file[i]: + file_number = i % fsize + with open(f"{outPath}{file_number}.txt", "a") as f: + f.write(uniInput_file[i]) + + ths = [] + for j in split_files: + ths.append( + Popen( + [ + "hmmsearch", + "--domtblout", + f"{outPath}d{j}", + "--cpu", + str(hmm_cpu), + "-o", + "/dev/null", + f"{dbDir}dbCAN_sub.hmm", + f"{outPath}{j}", + ] + ) + ) + for th in ths: + th.wait() + for m in split_files: + hmm_parser_output = hmmer_parser.run(f"{outPath}d{m}", eval_num=hmm_eval, coverage=hmm_cov) + with open(f"{outPath}temp_{m}", "w") as temp_hmmer_file: + temp_hmmer_file.write(hmm_parser_output) + os.remove(f"{outPath}d{m}") + os.remove(f"{outPath}{m}") + with open(f"{outPath}dtemp.out", "w"): pass + for n in split_files: + with open(f"{outPath}temp_{n}") as file_read: + files_lines = file_read.readlines() + os.remove(f"{outPath}temp_{n}") + with open(f"{outPath}dtemp.out", "a") as f: + f.writelines(files_lines) + else: + dbsub = Popen( + [ + "hmmsearch", + "--domtblout", + f"{outPath}d.txt", + "--cpu", + str(hmm_cpu), + "-o", + "/dev/null", + f"{dbDir}dbCAN_sub.hmm", + f"{outPath}uniInput", + ] + ) + dbsub.wait() + + hmm_parser_output = hmmer_parser.run(f"{outPath}d.txt", eval_num=hmm_eval, coverage=hmm_cov) + with open(f"{outPath}dtemp.out", "w") as temp_hmmer_file: + temp_hmmer_file.write(hmm_parser_output) + + print("total time:", time.time() - ticks) + + +def run_dbCAN( + inputFile, + inputType, + cluster=None, + dbCANFile="dbCAN.txt", + dia_eval=1e-102, + dia_cpu=4, + hmm_eval=1e-15, + hmm_cov=0.35, + hmm_cpu=4, + dbcan_thread=5, + dbcan_offset=2, + tf_eval=1e-4, + tf_cov=0.35, + tf_cpu=1, + stp_eval=1e-4, + stp_cov=0.3, + stp_cpu=1, + prefix="", + outDir="output", + dbDir="db", + cgc_dis=2, + cgc_sig_genes="tp", + tool_arg="all", + use_signalP=False, + signalP_path="signalp", + gram="all", +): + """ + Executes the dbCAN tool for analyzing genomic sequences. + + This function runs the dbCAN tool, which involves several steps: setting up paths and directories, gene prediction, running SignalP, core analysis tools (DIAMOND, HMMER, dbCAN_sub), and parsing the results. The function also combines results from different analysis tools and prepares a summary overview. + + Parameters + ---------- + inputFile (str): Path to the input file in FASTA format. + inputType (str): Type of sequence input (protein, prok, meta). + cluster (str, optional): Path to the cluster file if available. + dbCANFile (str): Filename of the HMM database. + dia_eval (float): E-value threshold for DIAMOND. + dia_cpu, hmm_cpu, dbcan_thread, tf_cpu, stp_cpu (int): CPU cores for respective tools. + hmm_eval (float): E-value threshold for HMMER. + hmm_cov, tf_cov, stp_cov (float): Coverage values for HMMER and related tools. + dbcan_offset (int): Offset parameter for dbCAN. + prefix (str): Prefix for output files. + outDir (str): Directory for output files. 
+    dbDir (str): Directory containing the databases.
+    cgc_dis (int): Distance parameter for CGC Finder.
+    cgc_sig_genes (str): Signature genes for CGC Finder.
+    tool_arg (str): Tools to run ('all', 'diamond', 'hmmer', etc.).
+    use_signalP (bool): Whether to use SignalP tool.
+    signalP_path (str): Path to the SignalP tool.
+    gram (str): Gram type for proteome/prokaryote nucleotide (p, n, all).
+
+    """
+    # Begin Setup and Input Checks
+    if not dbDir.endswith("/") and len(dbDir) > 0:
+        dbDir += "/"
+
+    if not outDir.endswith("/") and len(outDir) > 0:
+        outDir += "/"
+
+    outPath = outDir + prefix
+    auxFile = ""
+
+    find_clusters = False
+    if cluster is not None:
+        find_clusters = True
+        if inputType == "protein":
+            auxFile = cluster
+        else:
+            auxFile = "%sprodigal.gff" % outPath
+
+    if not os.path.isdir(dbDir):
+        print(dbDir, "ERROR: The database directory does not exist")
+        exit()
+
+    if not os.path.isfile(os.path.join(dbDir, "CAZy.dmnd")):
+        print(
+            "ERROR: No CAZy DIAMOND database found. \
+            Please make sure that your CAZy DIAMOND database is named 'CAZy.dmnd' and is located in your database directory"
+        )
+        exit()
+
+    if not os.path.isfile(os.path.join(dbDir, dbCANFile)):
+        print(
+            "ERROR: No dbCAN HMM database found. \
+            Please make sure that your dbCAN HMM database is named 'dbCAN.txt' (or passed via --dbCANFile), has been through hmmpress, and is located in your database directory"
+        )
+        exit()
+
+    if not os.path.isfile(os.path.join(dbDir, "dbCAN_sub.hmm")):
+        print(
+            "ERROR: No dbCAN_sub HMM database found. \
+            Please make sure that your dbCAN_sub HMM database is named 'dbCAN_sub.hmm', has been through hmmpress, and is located in your database directory"
+        )
+        exit()
+
+    if not os.path.isdir(outDir):
+        call(["mkdir", outDir])
+
+    if find_clusters and inputType == "protein":
+        if len(auxFile) > 0:
+            print(auxFile)
+            if not os.path.isfile(auxFile):
+                print("ERROR: It seems that the auxiliary filename that you provided does not exist, or is not a file")
+                exit()
+        else:
+            print(
+                "ERROR: Please provide an auxiliary input file with the position of each gene. This file can either be in BED or GFF format"
+            )
+            exit()
+
+    tools = [True, True, True]  # DIAMOND, HMMER, dbCAN_sub
+    if "all" not in tool_arg:
+        if "diamond" not in tool_arg:
+            tools[0] = False
+        if "hmmer" not in tool_arg:
+            tools[1] = False
+        if "dbcansub" not in tool_arg:
+            tools[2] = False
+
+    # End Setup and Input Checks
+    #########################
+    #########################
+    # Begin Gene Prediction Tools
+    if inputType == "prok":
+        call(
+            [
+                "prodigal",
+                "-i",
+                inputFile,
+                "-a",
+                "%suniInput" % outPath,
+                "-o",
+                "%sprodigal.gff" % outPath,
+                "-f",
+                "gff",
+                "-q",
+            ]
+        )
+    if inputType == "meta":
+        call(
+            [
+                "prodigal",
+                "-i",
+                inputFile,
+                "-a",
+                "%suniInput" % outPath,
+                "-o",
+                "%sprodigal.gff" % outPath,
+                "-f",
+                "gff",
+                "-p",
+                "meta",
+                "-q",
+            ]
+        )
+    # Proteome
+    if inputType == "protein":
+        call(["cp", inputFile, "%suniInput" % outPath])
+
+    # End Gene Prediction Tools
+    #######################
+    # Begin SignalP
+    if use_signalP:
+        print("\n\n***************************0. 
SIGNALP start*************************************************\n\n") + if gram == "p" or gram == "all": + signalpos = Popen(f"{signalP_path} -t gram+ {outPath}uniInput > {outPath}signalp.pos", shell=True) + if gram == "n" or gram == "all": + signalpneg = Popen(f"{signalP_path} -t gram- {outPath}uniInput > {outPath}signalp.neg", shell=True) + if gram == "euk" or gram == "all": + signalpeuk = Popen(f"{signalP_path} -t euk {outPath}uniInput > {outPath}signalp.euk", shell=True) + + # End SignalP + ####################### + # Begin Core Tools + + if tools[0]: ### run diamond + print("\n\n***************************1. DIAMOND start*************************************************\n\n") + os.system( + "diamond blastp -d %s -e %s -q %suniInput -k 1 -p %d -o %sdiamond.out -f 6" + % (os.path.join(dbDir, "CAZy"), str(dia_eval), outPath, dia_cpu, outPath) + ) + print("\n\n***************************1. DIAMOND end***************************************************\n\n") + + if tools[1]: ### run hmmsearch (hmmer) + print("\n\n***************************2. HMMER start*************************************************\n\n") + os.system( + f"hmmsearch --domtblout {outPath}h.out --cpu {hmm_cpu} -o /dev/null {os.path.join(dbDir, dbCANFile)} {outPath}uniInput " + ) + print("\n\n***************************2. HMMER end***************************************************\n\n") + + hmm_parser_output = hmmer_parser.run(f"{outPath}h.out", eval_num=hmm_eval, coverage=hmm_cov) + with open(f"{outPath}hmmer.out", "w") as hmmer_file: + hmmer_file.write(hmm_parser_output) + # could clean this up and manipulate hmm_parser_output data directly instead of passing it into a temp file + with open(f"{outPath}hmmer.out", "r+") as f: + text = f.read() + f.close() + call(["rm", f"{outPath}hmmer.out"]) + text = text.split("\n") + if "" in text: + text.remove("") + for i in range(len(text)): + if "GT2_" in text[i]: + profile = text[i].split("\t")[0].split(".")[0] + text[i] = text[i].replace(profile, "GT2") + with open(f"{outPath}hmmer.out", "a") as f: + f.write(text[i] + "\n") + f.close() + if os.path.exists(f"{outPath}h.out"): + call(["rm", f"{outPath}h.out"]) + + if tools[2]: + print( + "\n\n***************************3. dbCAN_sub start***************************************************\n\n" + ) + split_uniInput("%suniInput" % outPath, dbcan_thread, outPath, dbDir, hmm_eval, hmm_cov, hmm_cpu, dbcan_offset) + print("\n\n***************************3. 
dbCAN_sub end***************************************************\n\n") + + # Process dtemp.out and create dbcan-sub.hmm.out + with open(f"{outPath}dtemp.out") as f: + with open("%sdbcan-sub.hmm.out" % outPath, "w") as out: + processed_lines = [] + for line in f: + row = line.rstrip().split("\t") + row.append(float(int(row[6]) - int(row[5])) / int(row[1])) + if float(row[4]) <= 1e-15 and float(row[-1]) >= 0.35: + # out.write("\t".join([str(x) for x in row]) + "\n") + processed_lines.append("\t".join([str(x) for x in row])) + # Process dbcan-sub.hmm.out content + updated_lines = [line for line in processed_lines if line.strip()] + + for i in range(len(updated_lines)): + if "GT2_" in updated_lines[i]: + profile = updated_lines[i].split("\t")[0].split(".")[0] + updated_lines[i] = updated_lines[i].replace(profile, "GT2") + if updated_lines: + with open(f"{outPath}dbcan-sub.hmm.out", "w") as f2: + f2.write("\n".join(updated_lines)) + + + # End Core Tools + ######################## + # Begin Parse Results + + # parse dbCAN_sub result + if tools[2]: + subs_dict = {} + with open(f"{dbDir}fam-substrate-mapping.tsv") as f: + next(f) + for line in f: + r = line.split("\t") + key = (r[2], "-") if len(r[4]) == 1 else (r[2], r[4].strip()) + subs_dict[key] = r[0] + # Process dbcan-sub.hmm.out if it exists + if os.path.exists(f"{outPath}dbcan-sub.hmm.out"): + if os.path.exists(f"{outPath}temp"): + os.system(f"rm {outPath}temp") + with open(f"{outPath}dbcan-sub.hmm.out") as f, open(f"{outPath}temp", "w") as out: + out.write( + "dbCAN subfam\tSubfam Composition\tSubfam EC\tSubstrate\tProfile Length\tGene ID\tGene Length\tE Value\tProfile Start\tProfile End\tGene Start\tGene End\tCoverage\n" + ) + + for line in f: + profile = line.split("\t") + subfam = [] + sub_composition = [] + sub_ec = [] + newline = [] + substrate = [] + key1 = "-" + key2 = ["-"] + + for p in profile[0].split("|"): + if ".hmm" in p: + subfam.append(p.split(".")[0]) + key1 = p.split(".")[0].split("_")[0] + elif len(p.split(".")) == 4: + sub_ec.append(p) + key2.append(p.split(":")[0]) + else: + sub_composition.append(p) + + for i in range(len(key2)): + try: + substrate.append(subs_dict[key1, key2[i]]) + except KeyError as e: + print("No substrate for it", e) + + subfam = "|".join(subfam) + + if sub_composition: + sub_composition = "|".join(sub_composition) + else: + sub_composition = "-" + + if sub_ec: + sub_ec = "|".join(sub_ec) + else: + sub_ec = "-" + + if substrate: + substrate = ", ".join(substrate) + else: + substrate = "-" + + rest = "\t".join(profile[1:]) + + newline = subfam + "\t" + sub_composition + "\t" + sub_ec + "\t" + substrate + "\t" + rest + out.write(newline) + call(["mv", f"{outPath}temp", f"{outPath}dbcan-sub.hmm.out"]) + else: + print(f"File not found: {outPath}dbcan-sub.hmm.out") + + + # parse hmmer result + if tools[1]: + try: + with open(outDir + prefix + "hmmer.out") as f: + with open(outDir + prefix + "temp", "w") as out: + out.write( + "HMM Profile\tProfile Length\tGene ID\tGene Length\tE Value\tProfile Start\tProfile End\tGene Start\tGene End\tCoverage\n" + ) + for line in f: + out.write(line) + call(["mv", outDir + prefix + "temp", outDir + prefix + "hmmer.out"]) + except FileNotFoundError as e: + print(f"File not found: {e}") + # Optionally, create the file if it doesn't exist + with open(outDir + prefix + "temp", "w") as out: + out.write( + "HMM Profile\tProfile Length\tGene ID\tGene Length\tE Value\tProfile Start\tProfile End\tGene Start\tGene End\tCoverage\n" + ) + call(["mv", outDir + prefix + "temp", 
outDir + prefix + "hmmer.out"]) + except OSError as e: + print(f"IO error occurred: {e}") + + # parse diamond result + if tools[0]: + with open(outDir + prefix + "diamond.out") as f: + with open(outDir + prefix + "temp", "w") as out: + out.write( + "Gene ID\tCAZy ID\t% Identical\tLength\tMismatches\tGap Open\tGene Start\tGene End\tCAZy Start\tCAZy End\tE Value\tBit Score\n" + ) + for line in f: + out.write(line) + call(["mv", outDir + prefix + "temp", outDir + prefix + "diamond.out"]) + + # End Parse Results + ######################## + # Begin CGCFinder + + if find_clusters: ### run cgc_finder or not + print("*****************************CGC-Finder start************************************") + + ######################## + # Begin TF,TP, STP prediction + """ + tf hmmer + """ + runHmmer(outPath, str(tf_cpu), dbDir, str(tf_eval), str(tf_cov), "tf-1") + runHmmer(outPath, str(tf_cpu), dbDir, str(tf_eval), str(tf_cov), "tf-2") + """ + stp hmmer + """ + runHmmer(outPath, str(stp_cpu), dbDir, str(stp_eval), str(stp_cov), "stp") + + """ + tp diamond + """ + call( + [ + "diamond", + "blastp", + "-d", + dbDir + "tcdb.dmnd", + "-e", + "1e-10", + "-q", + "%suniInput" % outPath, + "-k", + "1", + "-p", + "1", + "-o", + outPath + "tp.out", + "-f", + "6", + ] + ) + + # Initialize set and dictionaries for TF, STP, TP, + tf, stp, tp = set(), set(), set() + tf_genes, stp_genes, tp_genes = {}, {}, {} + + with open("%stf-1.out" % outPath) as f: + for line in f: + row = line.rstrip().split("\t") + tf.add(row[2]) + row[0] = "DBD-Pfam|" + row[0] + if row[2] not in tf_genes: + tf_genes[row[2]] = row[0] + else: + tf_genes[row[2]] += "," + row[0] + + with open("%stf-2.out" % outPath) as f: + for line in f: + row = line.rstrip().split("\t") + tf.add(row[2]) + row[0] = "DBD-SUPERFAMILY|" + row[0] + if row[2] not in tf_genes: + tf_genes[row[2]] = row[0] + else: + tf_genes[row[2]] += "," + row[0] + + with open("%sstp.out" % outPath) as f: + for line in f: + row = line.rstrip().split("\t") + stp.add(row[2]) + row[0] = "STP|" + row[0] + if row[2] not in stp_genes: + stp_genes[row[2]] = row[0] + else: + stp_genes[row[2]] += "," + row[0] + + with open(outDir + prefix + "tp.out") as f: + for line in f: + row = line.rstrip().split("\t") + tp.add(row[0]) + if row[0] not in tp_genes: + tp_genes[row[0]] = row[1] + else: + tp_genes[row[0]] += "," + row[1] + + + # End TF and TP prediction + ########################## + # Begine CAZyme Extraction + + #define dictionary for cazyme_genes: key -> gene; val -> fams + cazyme_genes = {} + + dia = set() + hmm = set() + dbs = set() + + if tools[0]: # Deal with diamond result + with open(outDir + prefix + "diamond.out") as f: + next(f) # Skip the first line + for line in f: + row = line.rstrip().split("\t") + gene_key = row[0] + fam_vals = row[1].strip("|").split("|")[1:] + dia.add(gene_key) + cazyme_genes.setdefault(gene_key, set()).update(fam_vals) + + if tools[1]: # Deal with hmmer result + with open(outDir + prefix + "hmmer.out") as f: + next(f) # Skip the first line + for line in f: + row = line.rstrip().split("\t") + gene_key = row[2] + fam_val = row[0].split(".hmm")[0] + hmm.add(gene_key) + cazyme_genes.setdefault(gene_key, set()).add(fam_val) + + if tools[2]: ### deal with dbcan_sub result + with open(outDir + prefix + "dbcan-sub.hmm.out") as f: + next(f) + for line in f: + row = line.rstrip().split("\t") + dbs.add(row[5]) + cazyme_genes.setdefault(row[5], set()).add(row[0]) + cazyme_genes = {key: '|'.join(values) for key, values in cazyme_genes.items()} + if tools.count(True) > 1: 
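+        # when more than one tool was run, keep a gene as a CAZyme only if at least two
+        # of HMMER, DIAMOND and dbCAN_sub called it (the union of the pairwise intersections below)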
+ temp1 = hmm.intersection(dbs) + temp2 = hmm.intersection(dia) + temp3 = dia.intersection(dbs) + cazyme = temp1.union(temp2, temp3) + else: + cazyme = hmm.union(dia, dbs) + # End CAZyme Extraction + ###################### + # Begin GFF preperation + + if inputType in ["prok", "meta"]: # use Prodigal GFF output + with open(outDir + prefix + "prodigal.gff") as f: + with open(outDir + prefix + "cgc.gff", "w") as out: + for line in f: + if not line.startswith("#"): + row = line.rstrip().rstrip(";").split("\t") + num = row[-1].split(";")[0].split("_")[-1] + gene = row[0] + "_" + num + row[8] = "" + if gene in cazyme: + row[2] = "CAZyme" + row[8] = "DB=" + cazyme_genes[gene] + elif gene in tf: + row[2] = "TF" + row[8] = "DB=" + tf_genes[gene] + elif gene in tp: + row[2] = "TC" + row[8] = "DB=" + tp_genes[gene] + elif gene in stp: + row[2] = "STP" + row[8] = "DB=" + stp_genes[gene] + row[8] += ";ID=" + gene + out.write("\t".join(row) + "\n") + else: # user provided GFF/BED file + gff = False + with open(auxFile) as f: + for line in f: + if not line.startswith("#"): + if len(line.split("\t")) == 9: + gff = True + break + if gff: # user file was in GFF format + with open(auxFile) as f: + with open(outDir + prefix + "cgc.gff", "w") as out: + for line in f: + if not line.startswith("#"): + row = line.rstrip().split("\t") + if row[2] == "CDS": + note = row[8].strip().rstrip(";").split(";") + gene = "" + notes = {} + for x in note: + temp = x.split("=") + notes[temp[0]] = temp[1] + if "ID" in notes: + gene = notes["ID"] + else: + continue + if gene in cazyme: + row[2] = "CAZyme" + row[8] = "DB=" + cazyme_genes[gene] + elif gene in tf: + row[2] = "TF" + row[8] = "DB=" + tf_genes[gene] + elif gene in tp: + row[2] = "TC" + row[8] = "DB=" + tp_genes[gene] + elif gene in stp: + row[2] = "STP" + row[8] = "DB=" + stp_genes[gene] + else: + row[8] = "" + row[8] += ";ID=" + gene + out.write("\t".join(row) + "\n") + else: # user file was in BED format + with open(auxFile) as f: + with open(outDir + prefix + "cgc.gff", "w") as out: + for line in f: + if line.startswith("track"): + continue + row = line.rstrip().rstrip(";").split("\t") + outrow = ["."] * 8 + [""] + gene = row[1] + if gene in cazyme: + outrow[2] = "CAZyme" + outrow[8] = "DB=" + cazyme_genes[gene] + elif gene in tf: + outrow[2] = "TF" + outrow[8] = "DB=" + tf_genes[gene] + elif gene in tp: + outrow[2] = "TC" + outrow[8] = "DB=" + tp_genes[gene] + elif gene in stp: + outrow[2] = "STP" + outrow[8] = "DB=" + stp_genes[gene] + else: + outrow[2] = "CDS" + outrow[0] = row[0] + outrow[3] = row[2] + outrow[4] = row[3] + outrow[6] = row[4] + outrow[8] += ";ID=" + gene + out.write("\t".join(outrow) + "\n") + # End GFF + #################### + # Begin CGCFinder call + print("**************************************CGC-Finder start***********************************************") + cgc_finder(outDir + prefix + "cgc.gff", cgc_dis, cgc_sig_genes, outDir + prefix + "cgc.out") + simplify_cgc_output(outDir + prefix + "cgc.out") + print("**************************************CGC-Finder end***********************************************") + # End CGCFinder call + # End CGCFinder + #################### + # Begin SignalP combination + if use_signalP: ### signalP + print("Waiting on signalP") + with open(outDir + prefix + "temp", "w") as out: + if gram == "all" or gram == "p": + signalpos.wait() + print("SignalP pos complete") + + with open(outDir + prefix + "signalp.pos") as f: + for line in f: + if not line.startswith("#"): + row = line.split(" ") + row = [x for x in row 
if x != ""] + if row[9] == "Y": + out.write(line) + call(["rm", outDir + prefix + "signalp.pos"]) + if gram == "all" or gram == "n": + signalpneg.wait() + print("SignalP neg complete") + with open(outDir + prefix + "signalp.neg") as f: + for line in f: + if not line.startswith("#"): + row = line.split(" ") + row = [x for x in row if x != ""] + if row[9] == "Y": + out.write(line) + call(["rm", outDir + prefix + "signalp.neg"]) + if gram == "all" or gram == "euk": + signalpeuk.wait() + print("SignalP euk complete") + with open(outDir + prefix + "signalp.euk") as f: + for line in f: + if not line.startswith("#"): + row = line.split(" ") + row = [x for x in row if x != ""] + if row[9] == "Y": + out.write(line) + call(["rm", outDir + prefix + "signalp.euk"]) + call("sort -u " + outDir + prefix + "temp > " + outDir + prefix + "signalp.out", shell=True) + call(["rm", outDir + prefix + "temp"]) + + # End SignalP combination + ####################### + ####################### + # start Overview + print("Preparing overview table from hmmer, dbCAN_sub and diamond output...") + workdir = outDir + prefix + + # a function to remove duplicates from lists while keeping original order + def unique(seq): + exists = set() + return [x for x in seq if not (x in exists or exists.add(x))] + + arr_dbsub = None + arr_hmmer = None + + # check if files exist. if so, read files and get the gene numbers + if tools[0]: + arr_diamond = open(workdir + "diamond.out").readlines() + diamond_genes = [arr_diamond[i].split()[0] for i in range(1, len(arr_diamond))] # or diamond_genes = [] + + if tools[1]: + arr_hmmer = open(workdir + "hmmer.out").readlines() + hmmer_genes = [arr_hmmer[i].split()[2] for i in range(1, len(arr_hmmer))] # or hmmer_genes = [] + + if tools[2]: + arr_dbsub = open(workdir + "dbcan-sub.hmm.out").readlines() + dbsub_genes = [arr_dbsub[i].split("\t")[5] for i in range(1, len(arr_dbsub))] # or dbsub_genes = [] + + if use_signalP and (os.path.exists(workdir + "signalp.out")): + arr_sigp = open(workdir + "signalp.out").readlines() + sigp_genes = {} + for i in range(0, len(arr_sigp)): + row = arr_sigp[i].split() + sigp_genes[row[0]] = row[4] # previous one is row[2], use Y-score instead from suggestion of Dongyao Li + + # remove duplicates from input lists + if not tools[0]: + diamond_genes = [] + if not tools[1]: + hmmer_genes = [] + if not tools[2]: + dbsub_genes = [] + + if len(dbsub_genes) > 0: + if dbsub_genes[-1] is None: + dbsub_genes.pop() + dbsub_genes = unique(dbsub_genes) + if "hmmer_genes" in locals(): + hmmer_genes.pop() + hmmer_genes = unique(hmmer_genes) + if "diamond_genes" in locals(): + diamond_genes.pop() + diamond_genes = unique(diamond_genes) + + # parse input, stroe needed variables + if tools[0] and (len(arr_diamond) > 1): + diamond_fams = {} + for i in range(1, len(arr_diamond)): + row = arr_diamond[i].split("\t") + fam = row[1].strip("|").split("|") + diamond_fams[row[0]] = fam[1:] + + if tools[1] and (len(arr_hmmer) > 1): + hmmer_fams = {} + for i in range(1, len(arr_hmmer)): + row = arr_hmmer[i].split("\t") + fam = row[0].split(".") + fam = fam[0] + "(" + row[7] + "-" + row[8] + ")" + if row[2] not in hmmer_fams: + hmmer_fams[row[2]] = [] + hmmer_fams[row[2]].append(fam) + + if tools[2] and (len(arr_dbsub) > 1): + dbsub_fams = {} + for i in range(1, len(arr_dbsub)): + row_ori = arr_dbsub[i].split("\t") + fams_ID = row_ori[5] + if fams_ID not in dbsub_fams: + dbsub_fams[fams_ID] = {} + dbsub_fams[fams_ID]["fam_name"] = [] + dbsub_fams[fams_ID]["ec_num"] = [] + + 
dbsub_fams[fams_ID]["fam_name"].append(row_ori[0]) + dbsub_fams[fams_ID]["ec_num"].append(row_ori[2]) + + # overall table + + all_genes = unique(hmmer_genes + dbsub_genes + diamond_genes) + + with open(workdir + "overview.txt", "w+") as fp: + if use_signalP: + fp.write("Gene ID\tEC#\tHMMER\tdbCAN_sub\tDIAMOND\tSignalp\t#ofTools\n") + else: + fp.write("Gene ID\tEC#\tHMMER\tdbCAN_sub\tDIAMOND\t#ofTools\n") + for gene in all_genes: + csv = [gene] + num_tools = 0 + + if tools[2] and arr_dbsub is not None and (gene in dbsub_genes): + if dbsub_fams[gene]["ec_num"] == []: + csv.append("-") + else: + csv.append("|".join(dbsub_fams[gene]["ec_num"])) + else: + csv.append("-") + + if tools[1] and arr_hmmer is not None and (gene in hmmer_genes): + num_tools += 1 + csv.append("+".join(hmmer_fams[gene])) + else: + csv.append("-") + + if tools[2] and arr_dbsub is not None and (gene in dbsub_genes): + num_tools += 1 + csv.append("+".join(dbsub_fams[gene]["fam_name"])) + else: + csv.append("-") + + if tools[0] and arr_diamond is not None and (gene in diamond_genes): + num_tools += 1 + csv.append("+".join(diamond_fams[gene])) + else: + csv.append("-") + if use_signalP: + if gene in sigp_genes: + csv.append("Y(1-" + sigp_genes[gene] + ")") + else: + csv.append("N") + csv.append(str(num_tools)) + temp = "\t".join(csv) + "\n" + fp.write(temp) + print("overview table complete. Saved as " + workdir + "overview.txt") + # End overview + + +def rundbCAN_parser(): + """ + Sets up and returns an argument parser for the dbCAN3 Command Line Tool. + + This function configures an `argparse.ArgumentParser` for parsing command-line arguments necessary to run the dbCAN3 tool. It defines the expected input file, input type, and various optional parameters for running dbCAN3, including database files, DIAMOND and HMMER settings, output preferences, and additional tools and parameters for CGC finder and substrate prediction. + + Returns + ------- + argparse.ArgumentParser: Configured argument parser for dbCAN3 command-line tool. + + The parser includes arguments for: + - inputFile (str): Path to the input file in FASTA format. + - inputType (str): Type of sequence input (protein, prok, meta). + - dbCANFile (str): Filename of the HMM database. + - Various parameters for DIAMOND, HMMER, and other tools. + - Output settings, database directory, and tool selections. + - Parameters specific to CGC finder and substrate prediction modules. + """ + + usage = ''' + Example usages of run_dbcan: + 1. CAZyme annotation with isolated genome sequence as input: + run_dbcan EscheriaColiK12MG1655.fna prok + + 2. CAZyme annotation with protein sequence as input: + run_dbcan EscheriaColiK12MG1655.faa protein + + 3. CAZyme annotation with meta genome as input: + run_dbcan EscheriaColiK12MG1655.fna meta + + 4. CAZyme and CGC annotation with meta genome as input: + run_dbcan EscheriaColiK12MG1655.fna meta -c EscheriaColiK12MG1655.gff + + 5. CGC substrate prediction with the results from #4: + run_dbcan EscheriaColiK12MG1655.fna meta -c EscheriaColiK12MG1655.gff --cgc_substrate --only_sub + + 6. CAZyme, CGC annotation and substrate prediction with mete genome as input: + run_dbcan EscheriaColiK12MG1655.fna meta -c EscheriaColiK12MG1655.gff --cgc_substrate + + 7. CAZyme, CGC annotation and substrate prediction with protein and gff as input: + run_dbcan EscheriaColiK12MG1655.faa protein -c EscheriaColiK12MG1655.gff --cgc_substrate + + 8. 
CAZyme, CGC annotation and substrate prediction using hmmer against the dbCAN-HMM database with protein and gff as input:
+    run_dbcan EscheriaColiK12MG1655.faa protein -c EscheriaColiK12MG1655.gff --cgc_substrate --tools hmmer
+
+
+    More usage examples can be found in our documentation: https://dbcan.readthedocs.io/en/latest.
+    '''
+
+    parser = argparse.ArgumentParser(description="dbCAN: Automatic CAZyme Annotation", prog='run_dbcan', usage=usage)
+    parser.add_argument("inputFile", help="User input file. Must be in FASTA format.")
+    parser.add_argument(
+        "inputType",
+        choices=["protein", "prok", "meta"],  # protein=proteome, prok=prokaryote nucleotide, meta=metagenome nucleotide
+        help="Type of sequence input. protein=proteome; prok=prokaryote; meta=metagenome",
+    )
+    parser.add_argument(
+        "--dbCANFile",
+        default="dbCAN.txt",
+        help="File name of the HMM database (e.g. dbCAN.txt); please use the newest one from the dbCAN2 website.",
+    )
+    parser.add_argument("--dia_eval", default=1e-102, type=float, help="DIAMOND E Value")
+    parser.add_argument("--dia_cpu", default=8, type=int, help="Number of CPU cores that DIAMOND is allowed to use")
+    parser.add_argument("--hmm_eval", default=1e-15, type=float, help="HMMER E Value")
+    parser.add_argument("--hmm_cov", default=0.35, type=float, help="HMMER Coverage value")
+    parser.add_argument("--hmm_cpu", default=8, type=int, help="Number of CPU cores that HMMER is allowed to use")
+    parser.add_argument("--out_pre", default="", help="Output files prefix")
+    parser.add_argument("--out_dir", default="output", help="Output directory")
+    parser.add_argument("--db_dir", default="db", help="Database directory")
+    parser.add_argument(
+        "--tools",
+        "-t",
+        nargs="+",
+        choices=["hmmer", "diamond", "dbcansub", "all"],
+        default="all",
+        help="Choose a combination of tools to run",
+    )
+    parser.add_argument(
+        "--use_signalP",
+        default=False,
+        type=bool,
+        help="Use signalP or not. Note that you need to set up the signalP tool first; because of the signalP license, the Docker version does not include signalP.",
+    )
+    parser.add_argument(
+        "--signalP_path", "-sp", default="signalp", type=str, help="The path for signalp. Default location is signalp"
+    )
+    parser.add_argument(
+        "--gram",
+        "-g",
+        choices=["p", "n", "all"],
+        default="all",
+        help="Choose gram+(p) or gram-(n) for proteome/prokaryote nucleotide; these are SignalP parameters, used only when signalP is enabled",
+    )
+    parser.add_argument("-v", "--version", default="4.1.0", type=str)
+    # dbCAN-sub
+    dbCAN_sub_group = parser.add_argument_group("dbCAN-sub parameters")
+    dbCAN_sub_group.add_argument("--dbcan_thread", "-dt", default=12, type=int)
+    dbCAN_sub_group.add_argument("--tf_eval", default=1e-4, type=float, help="tf.hmm HMMER E Value")
+    dbCAN_sub_group.add_argument("--tf_cov", default=0.35, type=float, help="tf.hmm HMMER Coverage value")
+    dbCAN_sub_group.add_argument(
+        "--tf_cpu", default=8, type=int, help="tf.hmm Number of CPU cores that HMMER is allowed to use"
+    )
+    dbCAN_sub_group.add_argument("--stp_eval", default=1e-4, type=float, help="stp.hmm HMMER E Value")
+    dbCAN_sub_group.add_argument("--stp_cov", default=0.3, type=float, help="stp.hmm HMMER Coverage value")
+    dbCAN_sub_group.add_argument(
+        "--stp_cpu", default=8, type=int, help="stp.hmm Number of CPU cores that HMMER is allowed to use"
+    )
+
+    ### cgc finder
+    cgcfinder_group = parser.add_argument_group("CGC_Finder parameters")
+    cgcfinder_group.add_argument(
+        "--cluster",
+        "-c",
+        help="Predict CGCs via CGCFinder. This argument requires an auxiliary locations file if a protein input is being used",
+    )
+    cgcfinder_group.add_argument("--cgc_dis", default=2, type=int, help="CGCFinder Distance value")
+    cgcfinder_group.add_argument(
+        "--cgc_sig_genes",
+        default="tp",
+        choices=["tf", "tp", "stp", "tp+tf", "tp+stp", "tf+stp", "all"],
+        help="CGCFinder Signature Genes value",
+    )
+
+    ### cgc substrate prediction
+    cgcsubstrate_group = parser.add_argument_group("CGC_Substrate parameters")
+    cgcsubstrate_group.add_argument('--only_sub',action='store_false',help="Only run substrate prediction for PULs. If this parameter is present, dbcan will skip the CAZyme annotation and CGC prediction.")
+    cgcsubstrate_group.add_argument("--cgc_substrate", action="store_true", help="Run CGC substrate prediction?")
+    cgcsubstrate_group.add_argument("--pul", help="dbCAN-PUL PUL.faa")
+    cgcsubstrate_group.add_argument("-o", "--out", default="sub.prediction.out")
+    cgcsubstrate_group.add_argument("-w", "--workdir", type=str, default=".")
+    cgcsubstrate_group.add_argument("-env", "--env", type=str, default="local")
+    cgcsubstrate_group.add_argument(
+        "-odbcan_sub", "--odbcan_sub", action="store_true", help="Output dbCAN-sub prediction intermediate results (for debugging)?"
+    )
+    cgcsubstrate_group.add_argument(
+        "-odbcanpul", "--odbcanpul", action="store_true", help="Output dbCAN-PUL prediction intermediate results (for debugging)?"
+    )
+
+    ### cgc substrate prediction:dbCAN-PUL
+    group1 = parser.add_argument_group(
+        "dbCAN-PUL homologous searching parameters", "how to define homologous gene hits and PUL hits"
+    )
+    group1.add_argument("-upghn", "--uniq_pul_gene_hit_num", default=2, type=int)
+    group1.add_argument("-uqcgn", "--uniq_query_cgc_gene_num", default=2, type=int)
+    group1.add_argument("-cpn", "--CAZyme_pair_num", default=1, type=int)
+    group1.add_argument("-tpn", "--total_pair_num", default=2, type=int)
+    group1.add_argument(
+        "-ept", "--extra_pair_type", default=None, type=str, help="Extra signature pair types to consider besides CAZyme-CAZyme, e.g. TC-TC,STP-STP. Default: None"
+    )
+    group1.add_argument(
+        "-eptn", "--extra_pair_type_num", default="0", type=str, help="Number cutoff for each extra signature pair type, comma-separated, e.g. 1,2"
+    )
+    group1.add_argument(
+        "-iden", "--identity_cutoff", default=0.3, type=float, help="identity to identify a homologous hit"
+    )
+    group1.add_argument(
+        "-cov", "--coverage_cutoff", default=0.3, type=float, help="query coverage cutoff to identify a homologous hit"
+    )
+    group1.add_argument(
+        "-bsc", "--bitscore_cutoff", default=50, type=float, help="bitscore cutoff to identify a homologous hit"
+    )
+    group1.add_argument(
+        "-evalue", "--evalue_cutoff", default=0.01, type=float, help="evalue cutoff to identify a homologous hit"
+    )
+
+    ### cgc substrate prediction:dbCAN-sub
+    group2 = parser.add_argument_group(
+        "dbCAN-sub majority voting parameters", "how to define dbsub hits and dbCAN-sub subfamily substrate"
+    )
+    group2.add_argument("-hmmcov", "--hmmcov", default=0.3, type=float)
+    group2.add_argument("-hmmevalue", "--hmmevalue", default=0.01, type=float)
+    group2.add_argument(
+        "-ndsc",
+        "--num_of_domains_substrate_cutoff",
+        default=2,
+        type=int,
+        help="define how many domains must share a substrate in a CGC; one protein may include several subfamily domains.",
+    )
+    group2.add_argument(
+        "-npsc",
+        "--num_of_protein_substrate_cutoff",
+        default=2,
+        type=int,
+        help="define how many sequences must share a substrate in a CGC; one protein may include several subfamily domains.",
+    )
+    group2.add_argument(
+        "-subs",
+        "--substrate_scors",
+        default=2,
+        type=int,
+        help="minimum substrate score required for a CGC substrate prediction to be reported",
+    )
+
+    return parser
+
+
+# Putting the ArgumentParser in this block allows the script to be called from command line as before, while
+# allowing the main function to be called directly from other scripts without invoking a subprocess. This prevents extra
+# subprocesses or extra python interpreters being spawned, as well as simplifying python scripts which call run_dbcan.
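+# A minimal sketch of such a direct call (illustrative only: the module path and the
+# file names here are assumptions, while the keyword names mirror the cli_main() call
+# below and the values shown are the CLI defaults):
+#
+#     from dbcan.cli.run_dbcan import run_dbCAN
+#
+#     run_dbCAN(
+#         inputFile="EscheriaColiK12MG1655.faa", inputType="protein",
+#         cluster="EscheriaColiK12MG1655.gff", dbCANFile="dbCAN.txt",
+#         dia_eval=1e-102, dia_cpu=8, hmm_eval=1e-15, hmm_cov=0.35, hmm_cpu=8,
+#         dbcan_thread=12, tf_eval=1e-4, tf_cov=0.35, tf_cpu=8,
+#         stp_eval=1e-4, stp_cov=0.3, stp_cpu=8,
+#         prefix="", outDir="output", dbDir="db",
+#         cgc_dis=2, cgc_sig_genes="tp", tool_arg="all",
+#         use_signalP=False, signalP_path="signalp", gram="all",
+#     )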
+def cli_main(): + + parser = rundbCAN_parser() + args = parser.parse_args() + + ### rundbCAN3 + if args.only_sub: + run_dbCAN( + inputFile=args.inputFile, + inputType=args.inputType, + cluster=args.cluster, + dbCANFile=args.dbCANFile, + dia_eval=args.dia_eval, + dia_cpu=args.dia_cpu, + hmm_eval=args.hmm_eval, + hmm_cov=args.hmm_cov, + hmm_cpu=args.hmm_cpu, + dbcan_thread=args.dbcan_thread, + tf_eval=args.tf_eval, + tf_cov=args.tf_cov, + tf_cpu=args.tf_cpu, + stp_eval=args.stp_eval, + stp_cov=args.stp_cov, + stp_cpu=args.stp_cpu, + prefix=args.out_pre, + outDir=args.out_dir, + dbDir=args.db_dir, + cgc_dis=args.cgc_dis, + cgc_sig_genes=args.cgc_sig_genes, + tool_arg=args.tools, + use_signalP=args.use_signalP, + signalP_path=args.signalP_path, + gram=args.gram, + ) + + ### convert cgc_standard.out to json format + + if args.cluster: ### run cgc_finder + os.system(f"cgc_standard2json -i {args.out_dir}/cgc_standard.out -o {args.out_dir}/cgc_standard.out.json") + ### substarate prediction + if args.cgc_substrate: + cgc_substrate_prediction(args) + + +if __name__ == "__main__": + cli_main() diff --git a/dbcan_cli/syntenic_plot.py b/dbcan/cli/syntenic_plot.py similarity index 88% rename from dbcan_cli/syntenic_plot.py rename to dbcan/cli/syntenic_plot.py index e52e3171a..d2fdd45a4 100644 --- a/dbcan_cli/syntenic_plot.py +++ b/dbcan/cli/syntenic_plot.py @@ -9,9 +9,9 @@ from matplotlib import pyplot from matplotlib.patches import Patch import matplotlib -matplotlib.use('Agg') +#matplotlib.use('Agg') -plt.style.use('ggplot') +#plt.style.use('ggplot') import argparse,os @@ -97,46 +97,57 @@ def Get_Position(starts,ends,strands,maxbp,yshift=0,up=1): positions_str += str( ends[i] * pixeachbp - Triangle_length) + " " + str(plot_start_y) + " "## second point if up == 1: ###cluster 1 blocks.append(positions_str) - positions_str += str( ends[i] * pixeachbp) + " " + str(plot_start_y + poly_heigth) + " " ## 3 positions_str += str( ends[i] * pixeachbp - Triangle_length) + " " + str( plot_start_y + 2*poly_heigth) + " " ### 4 positions_str += str( starts[i] * pixeachbp )+ " " + str(plot_start_y + 2*poly_heigth) positions_str1 = str( starts[i] * pixeachbp )+ " " + str(plot_start_y + 2*poly_heigth) + " " positions_str1 += str( ends[i] * pixeachbp - Triangle_length) + " " + str( plot_start_y + 2*poly_heigth) + " " ### 5 - if up == 2: ### cluster 2 blocks.append(positions_str1) - if strands[i] == "-": positions_str = str( starts[i] * pixeachbp ) + " " + str(plot_start_y + poly_heigth) + " " - positions_str += str( starts[i] * pixeachbp + Triangle_length) + " " + str(plot_start_y) + " " positions_str += str(ends[i] * pixeachbp) + " " + str(plot_start_y) + " " - positions_str1 = str(ends[i] * pixeachbp) + " " + str(plot_start_y) + " " positions_str1 += str( starts[i] * pixeachbp + Triangle_length) + " " + str(plot_start_y) + " " if up == 1: blocks.append(positions_str1) - positions_str += str( ends[i] *pixeachbp ) + " " + str(plot_start_y + 2* poly_heigth) + " " positions_str += str( starts[i]* pixeachbp +Triangle_length) + " " + str(plot_start_y + 2* poly_heigth) - positions_str1 = str( ends[i] *pixeachbp ) + " " + str(plot_start_y + 2* poly_heigth) + " " positions_str1 += str( starts[i]* pixeachbp +Triangle_length) + " " + str(plot_start_y + 2* poly_heigth) if up == 2: blocks.append(positions_str1) - #print (positions_str) polygens.append(positions_str) - ### for genome line if i < len(starts) -1: positions_str = str( ends[i] *pixeachbp) + " " + str(plot_start_y + poly_heigth) + " " positions_str += str( 
starts[i+1]*pixeachbp) + " " + str(plot_start_y + poly_heigth) lines.append(positions_str) - - return polygens,blocks,lines + + scale_number = 10 + each_scale_bp = maxbp / scale_number + each_scale_pix = each_scale_bp * pixeachbp + + plot_start_y -= 30 + scale_positions = []; scale_positions_texts = [] ; scale_text = [] + scale_positions.append("0 " + str(plot_start_y + 3*poly_heigth) + " " + str(10*each_scale_pix) + " " + str(plot_start_y + 3*poly_heigth)) + + plot_start_y -= 1 + for i in range(scale_number+1): + positions_str = str(i*each_scale_pix) + " " + positions_str += str(plot_start_y + 3* poly_heigth) + " " + positions_str += str(i*each_scale_pix) + " " + positions_str += str(plot_start_y + 3*poly_heigth + 0.6* poly_heigth) + scale_positions.append(positions_str) + positions_str = str(i*each_scale_pix) + " " + str(plot_start_y + 3*poly_heigth + 0.6* poly_heigth) + scale_positions_texts.append(positions_str) + scale_text.append(str(int(each_scale_bp*i)+ shfit_pos)) + #print(scale_positions) + #print(scale_text) + return polygens,blocks,lines,scale_positions,scale_text def plot_Polygon_homologous(polygens1,polygens2,types1,types2,size,ax): @@ -149,7 +160,7 @@ def plot_Polygon_homologous(polygens1,polygens2,types1,types2,size,ax): for i in range(int(len(polygen)/2)): points.append([float(polygen[2*i]),float(polygen[2*i+1])]) ax.add_patch( - Polygon(points, color=color, alpha=0.5) + Polygon(points, color=color, alpha=0.5,lw=0) ) for j in range(len(polygens2)): @@ -159,7 +170,7 @@ def plot_Polygon_homologous(polygens1,polygens2,types1,types2,size,ax): for i in range(int(len(polygen)/2)): points.append([float(polygen[2*i]),float(polygen[2*i+1])]) ax.add_patch( - Polygon(points, color=color, alpha=0.5) + Polygon(points, color=color, alpha=0.5,lw=0) ) @@ -186,7 +197,7 @@ def Shade_curve(x11,x12,y11,y12,x21,x22,y21,y22,xmid,ymid,color): (CP, (x11,y11))] codes, verts = zip(*pathdata) path = Path(verts, codes) - pp = PathPatch(path,color=color,alpha=0.5,lw=0) + pp = PathPatch(path,color=color,alpha=0.2,lw=0) return pp @@ -220,10 +231,10 @@ def plot_genome_line(lines_coor1,lines_coor2,ax): for line in lines_coor1: x1,y1,x2,y2 = points2(line) - ax.add_patch(Polygon([(x1,y1),(x2,y2)], color="black",lw=2)) + ax.add_patch(Polygon([(x1,y1),(x2,y2)], color="gray",lw=2)) for line in lines_coor2: x1,y1,x2,y2 = points2(line) - ax.add_patch(Polygon([(x1,y1),(x2,y2)], color="black", lw=2)) + ax.add_patch(Polygon([(x1,y1),(x2,y2)], color="gray", lw=2)) ### input: gene cluster1: all starts coordinate, end coodinate, strands,Types, ### input: gene cluster2: all starts coordinate, end coodinate, strands,Types, @@ -284,8 +295,8 @@ def syntenic_plot(starts,starts1,ends,ends1,strands,strands1,Types,Types1,blocks ### decide which maxbp = max([max(ends) - min(starts),max(ends1) - min(starts1)]) - polygens,blocks_coor,lines_coor = Get_Position(starts,ends,strands,maxbp,yshift=0,up=1) - polygens1,blocks1_coor,lines_coor1 = Get_Position(starts1,ends1,strands1,maxbp,yshift=60,up=2) + polygens,blocks_coor,lines_coor,_,_ = Get_Position(starts,ends,strands,maxbp,yshift=0,up=1) + polygens1,blocks1_coor,lines_coor1,_,_ = Get_Position(starts1,ends1,strands1,maxbp,yshift=40,up=2) ### @@ -309,12 +320,14 @@ def syntenic_plot(starts,starts1,ends,ends1,strands,strands1,Types,Types1,blocks plt.text(500,90,cgcid,fontsize=30,horizontalalignment='center') plt.text(500,0,pulid,fontsize=30,horizontalalignment='center') plt.ylim(0,100) - plt.xlim(-100,1000) + plt.xlim(-100,1100) plt.axis('off') ax.plot() plt.tight_layout(pad=0.01) 
cgcid = cgcid.replace("|","_") ### need to replace "|" to "_", because | is a special chara for system - plt.savefig(f"syntenic.svg/{cgcid}.svg") + ### for local + #print(f"Save figure to file synteny.pdf/{cgcid}-syntenic.pdf ") + plt.savefig(f"synteny.pdf/{cgcid}-syntenic.pdf") plt.close() def read_blast_result_cgc(filename): @@ -336,7 +349,7 @@ def syntenic_plot_allpairs(args): cgc_proteinid2gene,cgcid2gene,cgcid2geneid = read_UHGG_CGC_stanrdard_out(args.cgc) PULid_proteinid2gene,PULid2gene,PULid2geneid = read_PUL_cgcgff(args) - os.makedirs("syntenic.svg", exist_ok=True) + os.makedirs("synteny.pdf", exist_ok=True) for line in open(args.input).readlines()[1:]: ### for each pairs lines = line.rstrip().split("\t") @@ -409,6 +422,8 @@ def read_PUL_cgcout(filename="PUL.out"): def read_cgcgff(filename,geneid2gene): + if not os.path.exists(filename): + return None for line in open(filename): lines = line.rstrip("\n").split("\t") proteinid = attribution(lines[-1],"ID") diff --git a/dbcan/utils/CGCFinder.py b/dbcan/utils/CGCFinder.py index 6e4f861e5..5373ff0f3 100755 --- a/dbcan/utils/CGCFinder.py +++ b/dbcan/utils/CGCFinder.py @@ -9,13 +9,12 @@ # # # Last updated 12/24/18 -#-updating info -#-adding stp +# -updating info +# -adding stp ################################################## - -#set up argument parser +# set up argument parser # parser = argparse.ArgumentParser(description='CAZyme Gene Cluster Finder') # parser.add_argument('gffFile', help='GFF file containing genome information') @@ -29,165 +28,402 @@ # #open output file # out = open(args.output, 'w+') -#global vars -cluster = [0, 0, 0, 0] #cazyme, tp, tf, stp +# global vars +import os +cluster = [0, 0, 0, 0] # cazyme, tp, tf, stp num_clusters = 0 -#define boolean function to determine if a cluster meets cluster requirements -def isCluster(siggenes): - global cluster - if siggenes == 'all': - if cluster[0] > 0 and cluster[1] > 0 and cluster[2] > 0 and cluster[3]: - return True - elif siggenes == 'tf': - if cluster[0] > 0 and cluster[2] > 0: - return True - elif siggenes == 'tp': - if cluster[0] > 0 and cluster[1] > 0: - return True - elif siggenes == 'stp': - if cluster[0] > 0 and cluster[3] > 0: - return True - elif siggenes == 'tp+tf': - if cluster[0] > 0 and cluster[1] > 0 and cluster[2]>0: - return True - elif siggenes == 'tp+stp': - if cluster[0] > 0 and cluster[1] > 0 and cluster[3]>0: - return True - elif siggenes == 'tf+stp': - if cluster[0] > 0 and cluster[2] > 0 and cluster[3]>0: - return True - else: - print('Warning: invalid siggenes argument') - return False - -#define boolean function to detemine if a gene is important (a signature gene) +# define boolean function to determine if a cluster meets cluster requirements +def isCluster(siggenes): + """ + Determines if a cluster of genes meets the criteria based on signature genes. + + This function checks if the current gene cluster configuration meets the specified criteria for being considered a significant cluster, depending on the signature genes specified. + + Parameters + ---------- + siggenes (str): The type of signature genes to consider (e.g., 'all', 'tf', 'tp', 'stp', etc.). + + Returns + ------- + bool: True if the cluster meets the criteria, False otherwise. + + Note: + This function uses a global variable `cluster` to access the current state of the gene cluster. 
+ """ + global cluster + if siggenes == "all": + if cluster[0] > 0 and cluster[1] > 0 and cluster[2] > 0 and cluster[3]: + return True + elif siggenes == "tf": + if cluster[0] > 0 and cluster[2] > 0: + return True + elif siggenes == "tp": + if cluster[0] > 0 and cluster[1] > 0: + return True + elif siggenes == "stp": + if cluster[0] > 0 and cluster[3] > 0: + return True + elif siggenes == "tp+tf": + if cluster[0] > 0 and cluster[1] > 0 and cluster[2] > 0: + return True + elif siggenes == "tp+stp": + if cluster[0] > 0 and cluster[1] > 0 and cluster[3] > 0: + return True + elif siggenes == "tf+stp": + if cluster[0] > 0 and cluster[2] > 0 and cluster[3] > 0: + return True + else: + print("Warning: invalid siggenes argument") + return False + + +# define boolean function to detemine if a gene is important (a signature gene) def isImportant(gene, siggenes): - if gene == 'CAZyme': - return True - else: - if gene == 'TC' and (siggenes == 'tp' or siggenes == 'all' or siggenes == 'tp+tf' or siggenes == 'tp+stp'): - return True - if gene == 'TF' and (siggenes == 'tf' or siggenes == 'all' or siggenes == 'tp+tf' or siggenes == 'tf+stp'): - return True - if gene == 'STP' and (siggenes == 'stp' or siggenes == 'all'or siggenes == 'tp+stp' or siggenes == 'tf+stp'): - return True - return False + """ + Determines if a gene is important based on its type and the specified signature genes. + + Parameters + ---------- + gene (str): The type of the gene (e.g., 'CAZyme', 'TC', 'TF', 'STP'). + siggenes (str): The type of signature genes to consider (e.g., 'all', 'tp', 'tf', 'stp', etc.). + + Returns + ------- + bool: True if the gene is considered important, False otherwise. + """ + if gene == "CAZyme": + return True + else: + if gene == "TC" and (siggenes == "tp" or siggenes == "all" or siggenes == "tp+tf" or siggenes == "tp+stp"): + return True + if gene == "TF" and (siggenes == "tf" or siggenes == "all" or siggenes == "tp+tf" or siggenes == "tf+stp"): + return True + if gene == "STP" and (siggenes == "stp" or siggenes == "all" or siggenes == "tp+stp" or siggenes == "tf+stp"): + return True + return False + def isSigGene(gene): - if gene == 'CAZyme' or gene == 'TC' or gene == 'TF' or gene == 'STP': - return True - else: - return False - -#define function to increase the cluster count + """ + Determines if a gene is a signature gene. + + Parameters + ---------- + gene (str): The type of the gene (e.g., 'CAZyme', 'TC', 'TF', 'STP'). + + Returns + ------- + bool: True if the gene is a signature gene, False otherwise. + """ + if gene == "CAZyme" or gene == "TC" or gene == "TF" or gene == "STP": + return True + else: + return False + + +# define function to increase the cluster count def increaseClusterCount(gene): - global cluster - if gene == 'CAZyme': - cluster[0] += 1 - elif gene == 'TC': - cluster[1] += 1 - elif gene == 'TF': - cluster[2] += 1 - elif gene == 'STP': - cluster[3] += 1 - else: - print("Warning: increaseClusterCount was called on bad functional domain") - -#define function to search for a cluster once an important gene has been found -#this function also handles output + """ + Increases the count of a specific gene type in the global cluster count. + + Parameters + ---------- + gene (str): The type of the gene (e.g., 'CAZyme', 'TC', 'TF', 'STP'). + + Note: + This function uses a global variable `cluster` to update the count of each gene type in the cluster. 
+ """ + global cluster + if gene == "CAZyme": + cluster[0] += 1 + elif gene == "TC": + cluster[1] += 1 + elif gene == "TF": + cluster[2] += 1 + elif gene == "STP": + cluster[3] += 1 + else: + print("Warning: increaseClusterCount was called on bad functional domain") + + +# define function to search for a cluster once an important gene has been found +# this function also handles output def startSearch(startRow, contig, distance, siggene, out): - global cluster - global num_clusters - dis = distance - index = startRow - between = 0 - lastImportant = 0 - while index < len(contig): - index += 1 - fd = contig[index][2] - if isImportant(fd, siggene): - increaseClusterCount(fd) - lastImportant = index - between = 0 - else: - between += 1 - if between > dis or index >= (len(contig)-1): - if isCluster(siggene): - num_clusters += 1 - #output file columns - #geneNumber type[2] downDis upDis CGC# contig[0] geneStart[3] geneEnd[4] geneID[8,ID] direc[6] note[8] - for j in range(startRow, lastImportant + 1): - fd = contig[j][2] - if isSigGene(fd): - upDown = findNear(contig, j, siggene) - notes = contig[j][8].split(";") - ID= "" - for note in notes: - if "ID" in note: - ID = note.split("=")[1] - row = [str(j), fd, str(upDown[1]), str(upDown[0]), 'CGC'+str(num_clusters), contig[j][0], contig[j][3], contig[j][4], ID, contig[j][6], contig[j][8]] - else: - row = [str(j), 'null', 'null', 'null', 'CGC'+str(num_clusters), contig[j][0], contig[j][3], contig[j][4], ID, contig[j][6]] - try: - row.append(contig[j][8]) - except: - pass - out.write('\t'.join(row) + '\n') - out.write('+++++' + '\n') - cluster = [0, 0, 0, 0] - return index - -#define function to find how close important genes are to each other + """ + Searches for a gene cluster starting from a specific row in a contig. + + This function initiates the search for a gene cluster in a contig, beginning from the specified row. It also handles outputting the cluster details. + + Parameters + ---------- + startRow (int): The starting row index for the search. + contig (list): The contig data. + distance (int): The maximum distance between significant genes. + siggene (str): Type of significant genes to consider. + out (file object): The output file object to write the cluster information. + + Returns + ------- + int: The index where the search in the contig should continue. + + Note: + This function uses global variables `cluster` and `num_clusters`. 
+ """ + global cluster + global num_clusters + dis = distance + index = startRow + between = 0 + lastImportant = 0 + while index < len(contig): + index += 1 + fd = contig[index][2] + if isImportant(fd, siggene): + increaseClusterCount(fd) + lastImportant = index + between = 0 + else: + between += 1 + if between > dis or index >= (len(contig) - 1): + if isCluster(siggene): + num_clusters += 1 + # output file columns + # geneNumber type[2] downDis upDis CGC# contig[0] geneStart[3] geneEnd[4] geneID[8,ID] direc[6] note[8] + for j in range(startRow, lastImportant + 1): + fd = contig[j][2] + if isSigGene(fd): + upDown = findNear(contig, j, siggene) + notes = contig[j][8].split(";") + ID = "" + for note in notes: + if "ID" in note: + ID = note.split("=")[1] + row = [ + str(j), + fd, + str(upDown[1]), + str(upDown[0]), + "CGC" + str(num_clusters), + contig[j][0], + contig[j][3], + contig[j][4], + ID, + contig[j][6], + contig[j][8], + ] + else: + row = [ + str(j), + "null", + "null", + "null", + "CGC" + str(num_clusters), + contig[j][0], + contig[j][3], + contig[j][4], + ID, + contig[j][6], + ] + try: + row.append(contig[j][8]) + except KeyError as e: + print("KeyError: ", e) + out.write("\t".join(row) + "\n") + out.write("+++++" + "\n") + cluster = [0, 0, 0, 0] + return index + + +# define function to find how close important genes are to each other def findNear(contig, index, siggene): - vals = ['null', 'null'] - k = index - 1 - l = index + 1 - while k >= 0: - if isImportant(contig[k][2], siggene): - vals[0] = index - k - 1 - break - else: - k -= 1 - while l <= len(contig) - 1: - if isImportant(contig[l][2], siggene): - vals[1] = l - index - 1 - break - else: - l += 1 - return vals + """ + Finds the distance to the nearest significant genes in a contig. + + Parameters + ---------- + contig (list): The contig data. + index (int): The current index in the contig. + siggene (str): Type of significant genes to consider. + + Returns + ------- + list: A list containing distances to the nearest significant genes. + """ + vals = ["null", "null"] + k = index - 1 + l = index + 1 + while k >= 0: + if isImportant(contig[k][2], siggene): + vals[0] = index - k - 1 + break + else: + k -= 1 + while l <= len(contig) - 1: + if isImportant(contig[l][2], siggene): + vals[1] = l - index - 1 + break + else: + l += 1 + return vals def cgc_finder(gffFile, distance, siggenes, output): - global cluster - global num_clusters - - #open output file - out = open(output, 'w+') - - #load contig into an array - contigs = {} - with open(gffFile) as f: - for line in f: - row = line.rstrip().split('\t') - if row[0] not in contigs: - contigs[row[0]] = [] - contigs[row[0]].append(row) - - #loop through contig - for key in contigs: - contig = contigs[key] - num_clusters = 0 - i = 0 - while i < len(contig) - 1: - fd = contig[i][2] - - if isImportant(fd, siggenes): - increaseClusterCount(fd) - i = startSearch(i, contig, distance, siggenes, out) - else: - i += 1 - - if output != 'none': - out.close() + """ + Performs CGC (Conserved Gene Cluster) finding on a given GFF file. + + This function reads a GFF file, processes it to find conserved gene clusters based on specified criteria, and writes the results to an output file. + + Parameters + ---------- + gffFile (str): The path to the GFF file. + distance (int): The maximum distance between significant genes to consider a cluster. + siggenes (str): The type of significant genes to consider for clustering. + output (str): The path to the output file where the results will be written. 
+
+    Side Effects:
+    - Reads from a specified GFF file.
+    - Writes the found clusters to an output file.
+
+    Note:
+    This function uses global variables `cluster` and `num_clusters`.
+    """
+    global cluster
+    global num_clusters
+
+    # open output file
+    out = open(output, "w+")
+
+    # load contig into an array
+    contigs = {}
+    with open(gffFile) as f:
+        for line in f:
+            row = line.rstrip().split("\t")
+            if row[0] not in contigs:
+                contigs[row[0]] = []
+            contigs[row[0]].append(row)
+
+    # loop through contig
+    for key in contigs:
+        contig = contigs[key]
+        num_clusters = 0
+        i = 0
+        while i < len(contig) - 1:
+            fd = contig[i][2]
+
+            if isImportant(fd, siggenes):
+                increaseClusterCount(fd)
+                i = startSearch(i, contig, distance, siggenes, out)
+            else:
+                i += 1
+
+    if output != "none":
+        out.close()
+
+#########################################################
+# Written by Qiwei Ge in Yin Lab at UNL
+# Produces a readable version of the cgc output
+#########################################################
+
+def simplify_cgc_output(inFile):
+    '''
+    Simplify the CGC output into a readable format and add a header line.
+
+    Parameters
+    ----------
+    Input file: inFile (default: cgc.out)
+
+    Output
+    ------
+    output file: cgc_standard.out
+
+    It contains the columns: CGC#, Gene Type, Contig ID, Protein ID, Gene Start,
+    Gene Stop, Direction, Protein Family
+    '''
+
+    try:
+        text = open(inFile).readlines()
+        text = [line.strip() for line in text]
+    except OSError as e:
+        print(f"fail to read {inFile}: {e}")
+        exit(-1)
+    dir = os.path.dirname(inFile)
+    # os.remove(dir+'cgc.out')
+    annotation = ''
+    if '' in text:
+        text.remove('')
+    with open(dir + '/cgc_standard.out', 'a') as f:
+        f.write("CGC#\tGene Type\tContig ID\tProtein ID\tGene Start\tGene Stop\tDirection\tProtein Family\n")
+    for i in range(len(text)):
+        simplified_line = []
+        if '+++++' not in text[i]:
+            each_line = text[i].split('\t')
+            simplified_line.append(each_line[4])
+            simplified_line.append(each_line[1])
+            simplified_line.append(each_line[5])
+            simplified_line.append(each_line[8])
+            simplified_line.append(each_line[6])
+            simplified_line.append(each_line[7])
+            simplified_line.append(each_line[9])
+
+            if '' in each_line:
+                each_line.remove('')
+            if 'TC' in each_line[1]:  # need fix split order if needed
+                annotation = each_line[10].split('|')[3].split(';')[0]
+                simplified_line.append(annotation)
+            elif 'CAZyme' in each_line[1]:  # need fix split order if needed
+                annotation = each_line[10].split(';')[0].split('=')[1]
+                simplified_line.append(annotation)
+            elif 'STP' in each_line[1]:  # need fix split order if needed
+                pre_annotation = each_line[10].split(';')[0].split('=')[1].split('|')
+                STP_counter = 0
+                for item in pre_annotation:  # hard-coded match on annotation tokens
+                    if 'STP' in item:
+                        STP_counter = STP_counter + 1
+                if STP_counter > 1:
+                    pre_annotation = each_line[10].split(';')[0].split('=')[1].split(',')
+                    STP_list = []
+                    for STP in pre_annotation:
+                        if 'STP' in STP:
+                            STP_list.append(STP.split('|')[1])
+                    annotation = ('+').join(STP_list)
+                elif STP_counter == 1:
+                    annotation = each_line[10].split(';')[0].split('=')[1].split('|')[1]
+                else:
+                    annotation = 'none'
+                simplified_line.append(annotation)
+            elif 'TF' in each_line[1]:
+                pre_annotation = each_line[10].split(';')[0].split('=')[1].split('|')
+                TF_counter = 0
+                for item in pre_annotation:  # hard-coded match on annotation tokens
+                    if 'DBD-Pfam' in item:
+                        TF_counter = TF_counter + 1
+                if TF_counter > 1:
+                    pre_annotation = each_line[10].split(';')[0].split('=')[1].split(',')
+                    TF_list = []
+                    for TF in pre_annotation:
+                        if 'DBD-Pfam' in TF:
+                            TF_list.append(TF.split('|')[1])
+
annotation = ('+').join(TF_list) + elif TF_counter == 1: + annotation = each_line[10].split(';')[0].split('=')[1].split('|')[1] + if 'DBD-SUPERFAMILY' in annotation: + annotation = annotation.split(',')[0] + else: + annotation = 'none' + simplified_line.append(annotation) + elif 'null' in each_line[1]: + simplified_line[3] = each_line[10].split('=')[1] + annotation = 'null' + simplified_line.append(annotation) + else: + annotation = 'empty line' + simplified_line.append(annotation) + simplified_line = '\t'.join(simplified_line) + + with open(dir + "/cgc_standard.out", 'a') as f: + f.write(simplified_line+'\n') + f.close() + else: + pass \ No newline at end of file diff --git a/dbcan/utils/__init__.py b/dbcan/utils/__init__.py index a66c39c34..9eefa9b9c 100644 --- a/dbcan/utils/__init__.py +++ b/dbcan/utils/__init__.py @@ -1,2 +1 @@ -from .simplify_cgc import simplify_output -from .CGCFinder import cgc_finder \ No newline at end of file +from .CGCFinder import cgc_finder, simplify_cgc_output \ No newline at end of file diff --git a/dbcan/utils/cgc_substrate_prediction.py b/dbcan/utils/cgc_substrate_prediction.py index 43add7fe7..fc59817ce 100644 --- a/dbcan/utils/cgc_substrate_prediction.py +++ b/dbcan/utils/cgc_substrate_prediction.py @@ -256,20 +256,13 @@ def GeneID2gene(self): self.geneid2gene = geneid2gene return geneid2gene - #def GeneID2substrate(self): - # geneid2sub = {}; geneid2subfam = {}; geneid2EC = {} - # for gene in self: - # geneid2sub.setdefault(gene.GeneID,[]).append(gene.Substrate) - # geneid2sub.setdefault(gene.GeneID,[]).append(gene.eCAMI_subfam) - # geneid2sub.setdefault(gene.GeneID,[]).append(gene.Subfam_EC) - # return geneid2gene class dbSub_record(object): ''' design for dbCAN_sub output, each line ''' def __init__(self,lines): - self.eCAMI_subfam = lines[0] + self.dbcan_sub_subfam = lines[0] self.Subfam_Composition = lines[1] self.Subfam_EC = lines[2] self.Substrate = lines[3] if lines[3]!= "-" else "" @@ -298,7 +291,7 @@ class dbCAN_substrate_prediction(object): ''' design for running substrate prediciton process with the output of dbCAN3. Run the following step. Step 1: run dbCAN-PUL searcing substrate prediction. -> dbCAN_PUL_substrate_predict - Step 2: run eCAMI-family substrate prediciton. -> eCAMI_subfamily_substrate_prediction + Step 2: run dbCAN_sub-family substrate prediciton. -> dbcan_sub_subfamily_substrate_prediction Step 3: combine eCAM-family and dbCAN-PUL substrate prediciton. 
''' @@ -310,7 +303,7 @@ def __init__(self,args): self.input_folder = args.input if args.input.endswith("/") else args.input +"/" self.cgc_out = self.input_folder + "cgc.out" self.cgc_standard_out = self.input_folder + "cgc_standard.out" - self.dbsub_out = self.input_folder +"dbsub.out" + self.dbsub_out = self.input_folder +"dbcan-sub.hmm.out" self.overview_txt = self.input_folder +"overview.txt" self.protein_db = self.input_folder +"uniInput" @@ -320,29 +313,29 @@ def __init__(self,args): #self.random_str = "59f220bd5fc4422187679301976a3d76" ## for debug #self.tmp_folder = "/dev/shm/" + self.random_str ### tmp folder to save some tmp files self.tmp_folder = self.input_folder - self.tmp_blastp_out = self.tmp_folder + "blastp.out" - self.tmp_CAZyme_pep = f"{self.tmp_folder}CAZyme.pep" + self.tmp_blastp_out = self.tmp_folder + "PUL_blast.out" + self.tmp_CAZyme_pep = f"{self.tmp_folder}CGC.faa" ### parameters self.PULdb = f"{ROOT_FOLDR}PUL.faa" - self.pul_excel_filename = f"{ROOT_FOLDR}dbCAN-PUL_07-01-2022.xlsx" + self.pul_excel_filename = f"{ROOT_FOLDR}dbCAN-PUL.xlsx" self.homologous_parameters = HitParamter(args) - self.dbsub_parameters = eCAMI_parameter(args) + self.dbsub_parameters = dbcan_sub_parameter(args) ### output parameters, intermediate results - self.odbsub = args.odbsub + self.odbcan_sub = args.odbcan_sub self.odbcanpul = args.odbcanpul self.dbcanpul_tmp = "dbcanpul.tmp.txt" - self.ecamipul_tmp = "ecamipul.tmp.txt" + self.dbcan_sub_tmp = "dbcansubpul.tmp.txt" ### Method to predict substrate self.run_dbCAN_sub = True self.run_dbCAN_PUL = True ### - self.eCAMI_CGC2substrates = {} + self.dbcan_sub_CGC2substrates = {} self.queryCGC2hit = {} - self.eCAMI_CGC2maxscore = {} + self.dbcan_sub_CGC2maxscore = {} def check_input(self): ''' @@ -404,7 +397,10 @@ def do_blastp_against_dbCANPUL(self): SeqIO.write(self.seqs,self.tmp_CAZyme_pep,'fasta') outfmt = '"6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"' - self.blastp_command = f"blastp -max_hsps 1 -query {self.tmp_CAZyme_pep} -db {self.PULdb} -outfmt {outfmt} -evalue 0.01 -out {self.tmp_blastp_out} -num_threads 6 " + self.blastp_command = f"blastp -max_hsps 1 -query {self.tmp_CAZyme_pep} -db {self.PULdb} -outfmt {outfmt} -evalue 0.01 -out {self.tmp_blastp_out} -num_threads 32 " + print(self.blastp_command) + print("[whether PUL db exists]", os.path.exists(self.PULdb)) + ### checking the blastp out if not os.path.exists(self.tmp_blastp_out): @@ -491,7 +487,7 @@ def Uniq_blastp_hit(self,blast_list): else: return -1,homologous_pairs - def eCAMI_read_cgc(self): + def dbcan_sub_read_cgc(self): if not self.cgcid2cgc: ### need to read cgc self.dbCAN_hits = dbCAN_Out(self.cgc_standard_out) @@ -629,26 +625,25 @@ def Read_CAZyme_substrate(self): #print(cgcid,gene.Protein_ID,self.geneid2dbsub[gene.Protein_ID]) - def eCAMI_subfamily_substrate_prediction(self): + def dbcan_sub_subfamily_substrate_prediction(self): ''' - substrate prediction based on eCAMI subfamily. + substrate prediction based on dbCAN-sub subfamily. this part includes 4 steps. Step 1: reading dbsub prediciton result -> Read_CAZyme_substrate - Step 2: reading cgc result -> eCAMI_read_cgc - Step 3: combine dbsub result and cgc result. -> CGC2substrate_eCAMI - Step 4: scoring the CGC substrate predicted by eCAMI, and get the best substrate -> substrate_scoring_eCAMI + Step 2: reading cgc result -> dbcan_sub_read_cgc + Step 3: combine dbsub result and cgc result. 
-> CGC2substrate_dbcan_sub + Step 4: scoring the CGC substrate predicted by dbCAN-sub, and get the best substrate -> substrate_scoring_dbcan_sub ''' self.Read_CAZyme_substrate() - self.eCAMI_read_cgc() - self.CGC2substrate_eCAMI() - self.substrate_scoring_eCAMI() - #self.eCAMI_sub_print_result() + self.dbcan_sub_read_cgc() + self.CGC2substrate_dbcan_sub() + self.substrate_scoring_dbcan_sub() - def substrate_scoring_eCAMI(self): - print("Start eCAMI subfamily substrate scoring") + def substrate_scoring_dbcan_sub(self): + print("Start dbCAN-sub subfamily substrate scoring") finalsub = {}; finalscores = {}; finalranks = {}; finalmaxscore = {} - cgcid2sub = self.cgcid2substrate_eCAMI + cgcid2sub = self.cgcid2substrate_dbcan_sub ### cgcid map to dbsub records, two dimensions list. ### one dimension is from CAZyme to substrate due to one CAZyme can have more than two substrate or two sub-family ### another dimension is from cgc, one cgc includes several CAZymes @@ -684,7 +679,7 @@ def substrate_scoring_eCAMI(self): ranks.append(f"{sub}:{scores[sub]}") finalscores[cgcid] = scores max_score = max(scores.values()) - if max_score < self.dbsub_parameters.eCAMI_substrate_scors: ### subsrate score less than cutoff + if max_score < self.dbsub_parameters.dbcan_substrate_scors: ### subsrate score less than cutoff continue ## next cgc finalmaxscore[cgcid] = max_score final_subs = [] @@ -694,25 +689,25 @@ def substrate_scoring_eCAMI(self): finalsub[cgcid] = ",".join(final_subs) finalranks[cgcid] = ranks - self.eCAMI_CGC2substrates = finalsub ### save the cgc substrate - self.eCAMI_CGC2scores = finalscores ### almost the same as eCAMI_substrate_score - self.eCAMI_substrate_score= finalranks ### - self.eCAMI_CGC2maxscore = finalmaxscore ### max score + self.dbcan_sub_CGC2substrates = finalsub ### save the cgc substrate + self.dbcan_sub_CGC2scores = finalscores ### almost the same as dbcan_sub_substrate_score + self.dbcan_sub_substrate_score= finalranks ### + self.dbcan_sub_CGC2maxscore = finalmaxscore ### max score - def eCAMI_intermediate_file(self): - f = open("ecami.tmp.txt",'w') + def dbcan_sub_intermediate_file(self): + f = open("dbCAN-sub.tmp.txt",'w') geneids_uniq = [] for cgcid in self.cgcid2cgc: - if cgcid in self.eCAMI_CGC2substrates: + if cgcid in self.dbcan_sub_CGC2substrates: for gene in self.cgcid2cgc[cgcid].genes: geneid = gene.Protein_ID if geneid not in geneids_uniq: geneids_uniq.append(geneid) - ECs,subs,esubfam = self.from_geneid_get_ecami(geneid) + ECs,subs,esubfam = self.from_geneid_get_dbcan_sub(geneid) f.write(cgcid+"\t"+geneid+"\t"+",".join(ECs)+"\t"+",".join(subs)+"\t"+",".join(esubfam)+"\n") - def from_geneid_get_ecami(self,geneid): + def from_geneid_get_dbcan_sub(self,geneid): genes = self.geneid2dbsub.get(geneid,"") if not genes: return [],[],[] @@ -720,7 +715,7 @@ def from_geneid_get_ecami(self,geneid): for i,gene in enumerate(genes): ECs.extend(clean_EC(gene)) subs.extend(clean_sub(gene)) - esubfam.append(gene.eCAMI_subfam) + esubfam.append(gene.dbcan_sub_subfam) ECs = set(ECs) subs = set(subs) return ECs,subs,esubfam @@ -728,30 +723,29 @@ def from_geneid_get_ecami(self,geneid): #print(geneid,ECs,subs,esubfam) #ECs = set([gene.Subfam_EC.split(":")[0] for gene in genes]) #subs = set([gene.Substrate.strip(" ") for gene in genes]) - #esubfam = [gene.eCAMI_subfam for gene in genes] + #esubfam = [gene.dbcan_sub_subfam for gene in genes] #f.write(geneid+"\t"+",".join(ECs)+"\t"+",".join(subs)+"\t"+",".join(esubfam)+"\n") #f.close() - def eCAMI_sub_print_result(self): - for cgc in 
self.eCAMI_CGC2substrates: - ### - print(cgc,self.eCAMI_CGC2substrates[cgc]) ### cgcid and substrates - print("\t".join(self.eCAMI_substrate_score[cgc])) ### ranking - #print(self.eCAMI_CGC2scores[cgc]) ### ranking + def dbcan_sub_sub_print_result(self): + for cgc in self.dbcan_sub_CGC2substrates: + print(cgc,self.dbcan_sub_CGC2substrates[cgc]) ### cgcid and substrates + print("\t".join(self.dbcan_sub_substrate_score[cgc])) ### ranking + #print(self.dbcan_sub_CGC2scores[cgc]) ### ranking tmp_lines = "" - for dbsubs in self.cgcid2substrate_eCAMI[cgc]: + for dbsubs in self.cgcid2substrate_dbcan_sub[cgc]: for dbsub in dbsubs: print(dbsub) print("-"*20) - def CGC2substrate_eCAMI(self): + def CGC2substrate_dbcan_sub(self): cgcid2sub = {} ; cgcid2substrate_CAZyme_num = {} for cgcid in self.cgcid2cgc: ## NC_000913.3|CGC26 for gene in self.cgcid2cgc[cgcid]: ### if gene.Protein_ID in self.geneid2dbsub: cgcid2sub.setdefault(cgcid,[]).append(self.geneid2dbsub[gene.Protein_ID]) #print(cgcid,gene.Protein_ID,self.geneid2dbsub[gene.Protein_ID]) - self.cgcid2substrate_eCAMI = cgcid2sub + self.cgcid2substrate_dbcan_sub = cgcid2sub self.cgcid2CAZyme_domain_substrate_num = {} ### count how many sequences in CAZyme has a substrate, and then calcuate the cgc potential substrate number @@ -776,7 +770,7 @@ def substrate_predict(self): if self.run_dbCAN_PUL: self.dbCAN_PUL_substrate_predict() if self.run_dbCAN_sub: - self.eCAMI_subfamily_substrate_prediction() + self.dbcan_sub_subfamily_substrate_prediction() def __del__(self): ''' remove tmp folder @@ -787,7 +781,7 @@ def __del__(self): ## print(f"Rmoving tmp file:{self.tmp_folder}") ## shutil.rmtree(self.tmp_folder) - def integrate_dbCANPUL_eCAMI(self): ### maybe need, in the future. + def integrate_dbCANPUL_dbcan_sub(self): ### maybe need, in the future. ''' combine two methods ''' @@ -795,35 +789,35 @@ def integrate_dbCANPUL_eCAMI(self): ### maybe need, in the future. 
def print_result(self): ### self.queryCGC2hit ### dbCAN-PUL hit - ### self.eCAMI_CGC2substrates ### eCAMI substrate - shared_cgcids = self.queryCGC2hit.keys() | self.eCAMI_CGC2substrates.keys() - print("#cgcid\tPULID\tdbCAN-PUL substrate\tbitscore\tsignarture pairs\teCAMI substrate\teCAMI substrate score") + ### self.dbcan_sub_CGC2substrates ### dbcan_sub substrate + shared_cgcids = self.queryCGC2hit.keys() | self.dbcan_sub_CGC2substrates.keys() + print("#cgcid\tPULID\tdbCAN-PUL substrate\tbitscore\tsignarture pairs\tdbCAN-sub substrate\tdbCAN-sub substrate score") for cgcid in shared_cgcids: dbcan_pul_part = self.queryCGC2hit.get(cgcid,"") - ecami_substate = self.eCAMI_CGC2substrates.get(cgcid,"") + dbcan_sub_substate = self.dbcan_sub_CGC2substrates.get(cgcid,"") PULID = dbcan_pul_part.pulid if dbcan_pul_part else "" dbcan_pul_sub = dbcan_pul_part.substrate if dbcan_pul_part else "" bitscore = dbcan_pul_part.score if dbcan_pul_part else "" sig_pairs = ";".join(dbcan_pul_part.maped_types) if dbcan_pul_part else "" - ecami_maxscore = self.eCAMI_CGC2maxscore.get(cgcid,"") - print(f"{cgcid}\t{PULID}\t{dbcan_pul_sub}\t{bitscore}\t{sig_pairs}\t{ecami_substate}\t{ecami_maxscore}") + dbcan_sub_maxscore = self.dbcan_sub_CGC2maxscore.get(cgcid,"") + print(f"{cgcid}\t{PULID}\t{dbcan_pul_sub}\t{bitscore}\t{sig_pairs}\t{dbcan_sub_substate}\t{dbcan_sub_maxscore}") def result_print_to_file(self): ### self.queryCGC2hit ### dbCAN-PUL hit - ### self.eCAMI_CGC2substrates ### eCAMI substrate - shared_cgcids = self.queryCGC2hit.keys() | self.eCAMI_CGC2substrates.keys() + ### self.dbcan_sub_CGC2substrates ### dbCAN-sub substrate + shared_cgcids = self.queryCGC2hit.keys() | self.dbcan_sub_CGC2substrates.keys() print (f"Writing substrate prediction result to file:{self.input_folder+self.out}") with open(self.input_folder+self.out,'w') as f: - f.write("#cgcid\tPULID\tdbCAN-PUL substrate\tbitscore\tsignature pairs\teCAMI substrate\teCAMI substrate score\n") + f.write("#cgcid\tPULID\tdbCAN-PUL substrate\tbitscore\tsignature pairs\tdbCAN-sub substrate\tdbCAN-sub substrate score\n") for cgcid in shared_cgcids: dbcan_pul_part = self.queryCGC2hit.get(cgcid,"") - ecami_substate = self.eCAMI_CGC2substrates.get(cgcid,"") + dbcan_sub_substate = self.dbcan_sub_CGC2substrates.get(cgcid,"") PULID = dbcan_pul_part.pulid if dbcan_pul_part else "" dbcan_pul_sub = dbcan_pul_part.substrate if dbcan_pul_part else "" bitscore = dbcan_pul_part.score if dbcan_pul_part else "" sig_pairs = ";".join(dbcan_pul_part.maped_types) if dbcan_pul_part else "" - ecami_maxscore = self.eCAMI_CGC2maxscore.get(cgcid,"") - f.write(f"{cgcid}\t{PULID}\t{dbcan_pul_sub}\t{bitscore}\t{sig_pairs}\t{ecami_substate}\t{ecami_maxscore}\n") + dbcan_sub_maxscore = self.dbcan_sub_CGC2maxscore.get(cgcid,"") + f.write(f"{cgcid}\t{PULID}\t{dbcan_pul_sub}\t{bitscore}\t{sig_pairs}\t{dbcan_sub_substate}\t{dbcan_sub_maxscore}\n") class PULhit(object): ''' @@ -857,11 +851,11 @@ def cgc_prediction_webserver(args,sub_pred): ### update parameters sub_pred.tmp_folder = args.workdir ### tmp folder to save some tmp files - sub_pred.tmp_blastp_out = sub_pred.tmp_folder + "blastp.out" - sub_pred.tmp_CAZyme_pep = sub_pred.tmp_folder + "CAZyme.pep" + sub_pred.tmp_blastp_out = sub_pred.tmp_folder + "PUL_blast.out" + sub_pred.tmp_CAZyme_pep = sub_pred.tmp_folder + "CAZyme.faa" sub_pred.PULdb = f"{script_folder}/PUL.faa" - sub_pred.pul_excel_filename = f"{script_folder}/dbCAN-PUL_07-01-2022.xlsx" + sub_pred.pul_excel_filename = f"{script_folder}/dbCAN-PUL.xlsx" ### loading parameters 
file from php blast.php named with parameters.json parameter_file = "parameters.json" @@ -909,19 +903,19 @@ def cgc_substrate_prediction(args): print(f"Substrate prediciton done! {(time_end-time_start)}s") sub_pred.result_print_to_file() - if sub_pred.odbsub: - sub_pred.eCAMI_intermediate_file() + if sub_pred.odbcan_sub: + sub_pred.dbcan_sub_intermediate_file() time_end = time.time() ### plot the syntenic block - #plot_command = "python3 /array1/www/dbCAN3/ty/syntenic.plot.py syntenic_plot -b blastp.out --cgc cgc_standard.out -i sub.prediction.out" if args.cgc_substrate: os.chdir(args.workdir) - #plot_command = f"python3 {ROOT_FOLDR}/syntenic.plot.py syntenic_plot -b blastp.out --cgc cgc_standard.out -i sub.prediction.out" if args.db_dir.startswith("/"): - plot_command = f"syntenic_plot syntenic_plot -b blastp.out --cgc cgc_standard.out -i sub.prediction.out --db {args.db_dir}" + plot_command = f"syntenic_plot syntenic_plot -b PUL_blast.out --cgc cgc_standard.out -i {args.out} --db {args.db_dir}" else: - plot_command = f"syntenic_plot syntenic_plot -b blastp.out --cgc cgc_standard.out -i sub.prediction.out --db ../{args.db_dir}" + plot_command = f"syntenic_plot syntenic_plot -b PUL_blast.out --cgc cgc_standard.out -i {args.out} --db ../{args.db_dir}" + #print command + print(plot_command) os.system(plot_command) print(f"All done! {(time_end-time_start)}s") @@ -949,13 +943,13 @@ def __init__(self,args): def __repr__(self): return "\n".join([name + ": " +str(self.__dict__[name]) for name in self.__dict__]) -class eCAMI_parameter(object): +class dbcan_sub_parameter(object): def __init__(self,args): self.hmmevalue = args.hmmevalue self.hmmcov = args.hmmcov self.num_of_protein_shared_substrate_cutoff = args.num_of_protein_substrate_cutoff self.num_of_domains_substrate_cutoff = args.num_of_domains_substrate_cutoff - self.eCAMI_substrate_scors = args.substrate_scors + self.dbcan_substrate_scors = args.substrate_scors def __repr__(self): return "\n".join([name + ": " +str(self.__dict__[name]) for name in self.__dict__]) @@ -992,11 +986,11 @@ def parse_argv(): group.add_argument('--pul',help="dbCAN-PUL PUL.faa") group.add_argument('-f','--fasta') group.add_argument('-b','--blastp') - group.add_argument('-o','--out',default="sub.prediction.out") + group.add_argument('-o','--out',default="substrate.out") group.add_argument('-w','--workdir',type=str,default=".") group.add_argument('-rerun','--rerun',type=bool,default=False) group.add_argument('-env','--env',type=str,default="local") - group.add_argument('-odbsub','--odbsub', help="output dbcan_sub prediction intermediate result?") + group.add_argument('-odbcan_sub','--odbcan_sub', help="output dbcan_sub prediction intermediate result?") group.add_argument('-odbcanpul','--odbcanpul',type=bool,default=True,help="output dbCAN-PUL prediction intermediate result?") parser.add_argument('--db_dir', default="db", help='Database directory') @@ -1015,7 +1009,7 @@ def parse_argv(): group1.add_argument('-bsc','--bitscore_cutoff',default = 50,type=float,help="bitscore cutoff to identify a homologous hit") group1.add_argument('-evalue','--evalue_cutoff',default = 0.01,type=float,help="evalue cutoff to identify a homologous hit") - group2 = parser.add_argument_group('eCAMI conditons', 'how to define dbsub hits and eCAMI subfamily substrate') + group2 = parser.add_argument_group('dbCAN-sub conditons', 'how to define dbsub hits and dbCAN-sub subfamily substrate') group2.add_argument('-hmmcov','--hmmcov',default = 0.,type=float) 
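### Illustration only (not part of the code): roughly how the cutoffs in this
### argument group combine during dbCAN-sub majority voting for one CGC. A
### hypothetical tally over the CGC's CAZyme domains would look like:
###     votes = {}  # substrate -> number of supporting subfamily domains
###     for domain in cgc_cazyme_domains:
###         for sub in subfam2substrate.get(domain, []):
###             votes[sub] = votes.get(sub, 0) + 1
### Only domains passing -hmmcov/-hmmevalue contribute, and a substrate is kept
### when enough domains support it (see -ndsc below) and its score passes the
### --substrate_scors cutoff.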
group2.add_argument('-hmmevalue','--hmmevalue',default = 0.01,type=float) group2.add_argument('-ndsc','--num_of_domains_substrate_cutoff',default = 2,type=int,help="define how many domains share substrates in a CGC, one protein may include several subfamily domains.") diff --git a/dbcan/utils/diamond_unassembly.py b/dbcan/utils/diamond_unassembly.py new file mode 100644 index 000000000..9bb3a03d8 --- /dev/null +++ b/dbcan/utils/diamond_unassembly.py @@ -0,0 +1,471 @@ +import os +import sys + +def HLError(mess): + return f"\033[1;31;40m{mess}:\033[0m" +### paf record sample +''' +1 string Query sequence name +2 int Query sequence length +3 int Query start (0-based; BED-like; closed) +4 int Query end (0-based; BED-like; open) +5 char Relative strand: "+" or "-" +6 string Target sequence name +7 int Target sequence length +8 int Target start on original strand (0-based) +9 int Target end on original strand (0-based) +10 int Number of residue matches +11 int Alignment block length +12 int Mapping quality (0-255; 255 for missing) +13 attribute + +''' +### diamond + +''' +1. qseqid query or source (gene) sequence id +2. sseqid subject or target (reference genome) sequence id +3. pident percentage of identical positions +4. length alignment length (sequence overlap) +5. mismatch number of mismatches +6. gapopen number of gap openings +7. qstart start of alignment in query +8. qend end of alignment in query +9. sstart start of alignment in subject +10. send end of alignment in subject +11. evalue expect value +12. bitscore bit score +13. qlen query length +14. slen subject length +''' + +def CAZy_filter(cazy): + return set([aa for aa in cazy]) + #return set([aa.split("_")[0] for aa in cazy]) + +### need to convert to blastp 6 +class PafRecord(object): + def __init__(self,lines): + self.Qsn = lines[0] + self.Qsl = lines[12] + self.Qs = int(lines[6]) -1 + self.Qe = lines[7] + self.Strand = lines[4] + self.Tsn = lines[1] + self.Tsl = lines[13] + self.Ts = int(lines[8]) -1 + self.Te = lines[9] + self.Nrm = lines[11] + self.Abl = lines[3] + self.Mq = lines[10] ### if the paf was converted from sam, Mq here stands for the MAPQ + ### deal information + self.SeqID = self.Tsn.split('|')[0] + self.CAZys = CAZy_filter(self.Tsn.strip("|").split("|")[1:]) ### seqid|cazy1|cazy2|...| ## not subfamily + self.UniReadId = lines[0].split(".")[0] + def __str__(self): + return "\t".join([str(getattr(self, value)) for value in vars(self) if value != "CAZys"]) + +class Paf(object): + def __init__(self,filename): + self.records = [PafRecord(line.split()) for line in open(filename)] + def __iter__(self): + return iter(self.records) + ### get reads id + def GetReadId(self): + return [record.Qsn for record in self] + ### get protein id + def GetSeqId(self): + return [record.SeqID for record in self] + ### get protein id: protein length dictory + def GetSeqLen(self): + return {record.SeqID:record.Tsl for record in self} + ### get CAZy family id 2 protein id: one-many + def CAZy2SeqID(self,CazySeqId): + for record in self: + for cazy in record.CAZys: + CazySeqId.setdefault(cazy,[]).append(record.SeqID) + ## get protein id 2 read is: one-many + def SeqID2ReadID(self,aa): + for record in self: + aa.setdefault(record.SeqID,[]).append(record.Qsn) + def ReadID2Record(self): + return {record.Qsn:record for record in self} + def Output(self): + [print (record) for record in self] + ## the CAZy information for megahit are not Qsn instead of they are in the + def Assign_CAZy_megahit(self): + for cazy in self: + cazy.CAZys = 
CAZy_filter(cazy.Qsn.strip("|").split("|")[1:])
+    def Assign_subfam(self,CAZyID2subfam):
+        for hit in self:
+            hit.subfams = CAZyID2subfam.get(hit.Tsn,"")
+    def Get_subfam2SeqID(self,subfam2SeqID):
+        for record in self:
+            for cazy in record.subfams:
+                subfam2SeqID.setdefault(cazy,[]).append(record.SeqID)
+
+def CAZyReadCount(cazyid,cazy2seqid,readtable):
+    tmp_sum = 0
+    for seqid in cazy2seqid[cazyid]:
+        tmp_sum += readtable[seqid]
+    return tmp_sum
+
+def FPKMToCsv(args,tool,cazyfpkm,readtable,cazy2seqid):
+    outfilename = args.output
+    with open(outfilename,'w') as f:
+        f.write(f"Family\tAbundance\tSeqNum\tReadCount\n")
+        for cazyid in cazyfpkm:
+            seqnum = len(cazy2seqid[cazyid])
+            readcount = CAZyReadCount(cazyid,cazy2seqid,readtable)
+            fpkm = cazyfpkm[cazyid]
+            if not cazyid[0].isdigit():
+                f.write(f"{cazyid}\t{fpkm}\t{seqnum}\t{readcount}\n")
+
+def check_read_type(filename):
+    if filename.endswith("fq") or filename.endswith("fq.gz"):
+        return "fq"
+    elif filename.endswith("fa") or filename.endswith("fa.gz"):
+        return "fa"
+    else:
+        sys.stderr.write(HLError("Error") + " File type not supported, please provide a .fa(.fa.gz) or .fq(.fq.gz) reads file.\n")
+        exit(1)
+
+def get_count_reads(file):
+    if file.endswith("fq.gz"):
+        r = os.popen("zcat " + file + " | echo $((`wc -l`/4))")
+    elif file.endswith(".fq"): ### fixed: was `filename`, which is undefined in this function
+        r = os.popen("cat " + file + " | echo $((`wc -l`/4))")
+    elif file.endswith("fa.gz"):
+        r = os.popen("zcat " + file + " | grep '>' " + " | wc -l")
+    elif file.endswith(".fa"): ### fixed: was `filename`, which is undefined in this function
+        r = os.popen("grep '>' " + file + " | wc -l")
+    text = r.read()
+    r.close()
+    return float(text)
+
+### single-end sequencing is not supported yet: the helpers below need both paf1 and paf2
+
+def diamond_unassemble_data(args):
+    check_read_type(args.raw_reads)
+    if not args.paf2: ### fail early instead of raising a NameError in Cal_FPKM
+        sys.stderr.write(HLError("Error") + " Single-end input is not supported yet, please provide -paf2.\n")
+        exit(1)
+    paf1 = Paf(args.paf1)
+    paf2 = Paf(args.paf2)
+    totalreadnumber = float(get_count_reads(args.raw_reads))*2 ### R1 and R2 have the same read count
+    ### FPKM or TPM is based on args.normalized
+    cazyfpkm,readtable,cazy2seqid = Cal_FPKM(paf1,paf2,totalreadnumber,args.normalized)
+    FPKMToCsv(args,"Diamond",cazyfpkm,readtable,cazy2seqid)
+
+def diamond_filter(args):
+    print_seqids = {}
+    for line in open(args.paf1):
+        lines = line.split()
+        if lines[0] not in print_seqids:
+            print(line.rstrip("\n"))
+            print_seqids[lines[0]] = 1
+
+def getSeqlen(paf1,paf2):
+    x = paf1.GetSeqLen()
+    y = paf2.GetSeqLen()
+    return merge_two_dicts(x,y)
+
+def getCazySeqId(paf1,paf2):
+    cazy2seqid = {}
+    paf1.CAZy2SeqID(cazy2seqid)
+    paf2.CAZy2SeqID(cazy2seqid)
+    for cazy in cazy2seqid:
+        cazy2seqid[cazy] = set(cazy2seqid[cazy])
+    return cazy2seqid
+
+def get_subfam2seqid(paf1,paf2):
+    subfam2seqid = {}
+    paf1.Get_subfam2SeqID(subfam2seqid)
+    paf2.Get_subfam2SeqID(subfam2seqid)
+    for subfam in subfam2seqid:
+        subfam2seqid[subfam] = set(subfam2seqid[subfam])
+    return subfam2seqid
+
+def getSeqReadID(paf1,paf2):
+    seqid2readid = {}
+    paf1.SeqID2ReadID(seqid2readid)
+    paf2.SeqID2ReadID(seqid2readid)
+    return seqid2readid
+
+def SeqReadCount(seqid2readid):
+    ## note: for paired-end input the two mates of one fragment are counted as two reads here
+    return {seqid:len(seqid2readid[seqid]) for seqid in seqid2readid}
+
+def merge_two_dicts(x, y):
+    z = x.copy()
+    z.update(y)
+    return z
+
+def SequenceFPKM(readtable,seq2len,totalreadnumber):
+    seqfpkm = {}
+    for seqid in readtable:
+        tmp_total_read = float(totalreadnumber)/pow(10,6)
+        tmp_trans_len = float(seq2len[seqid])/1000
+        read_count = float(readtable[seqid])
+        tmp_fpkm = read_count/tmp_total_read/tmp_trans_len
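### Worked check of the line above (toy numbers, illustration only):
### read_count = 200, totalreadnumber = 2e7, seq2len[seqid] = 1500, so
### tmp_fpkm = 200 / (2e7/1e6) / (1500/1000) = 200 / 20 / 1.5 = 6.67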
#print(seqid,totalreadnumber,seq2len[seqid],read_count) + seqfpkm[seqid] = tmp_fpkm + return seqfpkm + +### Ni/Li*10^6 +### TPM = ------------------------------ +### sum(N1/L1+N2/L2 + ... + Nn/Ln) + +### but the TPM cann't not applied in assembly-free, because it will increase the value +### very limit reads can align to CAZyme +def SequenceTPM(readtable,seq2len,totalreadnumber): + seqtpm = {} + normalized_tpm = 0. + ### calculate normalized_tpm + for seqid in readtable: + read_count = float(readtable[seqid]) + seqlen = float(seq2len[seqid]) + normalized_tpm += read_count/seqlen + ### calculate tpm + for seqid in readtable: + read_count = float(readtable[seqid]) + seqlen = float(seq2len[seqid]) + normalized_reads_counts = read_count/seqlen*pow(10,6) + tmp_seqtpm = normalized_reads_counts/normalized_tpm + seqtpm[seqid] = tmp_seqtpm + return seqtpm + +### read count *10^6 +### RPM = ------------------------ +### total read count(mapped) + +def SequenceRPM(readtable,seq2len,totalreadnumber): + seqrpm = {} + for seqid in readtable: + read_count = float(readtable[seqid]) + seqlen = float(seq2len[seqid]) + rpm = read_count*pow(10,6)/totalreadnumber + seqrpm[seqid] = rpm + return seqrpm + +def CAZyFPKM(seqfpkm,cazy2seqid): ## named as FPKM, but can apply in tpm + cazyfpkm = {} + for cazy in cazy2seqid: + tmp_fpkm = 0. + for seqid in cazy2seqid[cazy]: + tmp_fpkm += float(seqfpkm[seqid]) + cazyfpkm[cazy] = tmp_fpkm + return cazyfpkm + +def Cal_FPKM(paf1,paf2,totalreadnumber,normalized): + ## get sequence length from paf + seq2len = getSeqlen(paf1,paf2) + + # get CAZy family to seq mapping table: CAZy ID 2 protein ID + cazy2seqid = getCazySeqId(paf1,paf2) + # outdict_list(cazy2seqid) + + ## get SeqID2ReadID to generate mapping table: protein ID 2 read ID + seqid2readid = getSeqReadID(paf1,paf2) + ### read table: protein ID 2 read count + readtable = SeqReadCount(seqid2readid) + ## outdict(readtable) + ## calculate fpkm for each protein seq + if normalized == "FPKM": + seqfpkm = SequenceFPKM(readtable,seq2len,totalreadnumber) + elif normalized == "RPM": + seqfpkm = SequenceRPM(readtable,seq2len,totalreadnumber) + else: + seqfpkm = SequenceTPM(readtable,seq2len,totalreadnumber) + ## outdict(seqfpkm) + cazyfpkm = CAZyFPKM(seqfpkm,cazy2seqid) + #outdict(cazyfpkm) + return cazyfpkm,readtable,cazy2seqid + +import argparse + +## dict: str -> [] + +def read_EC2substrate_table(args): + famEC2substrate = {} + map_table = f"{args.db}fam-substrate-mapping.tsv" + map_table_lines = open(map_table).readlines() + for line in map_table_lines[1:]: + lines = line.rstrip("\n").split("\t") + substrates = [sub_tmp.strip(" ") for sub_tmp in lines[0].strip().replace("and","").split(',')] + #famEC2substrate.setdefault(lines[2],[]).extend(substrates) + famEC2substrate.setdefault(lines[-1],[]).extend(substrates) + #famEC2substrate[lines[-1]] = lines[0] + for fam in famEC2substrate: + famEC2substrate[fam] = list(set(famEC2substrate[fam])) + return famEC2substrate + +### each protein may includes more than 1 eCAMI subfam +def read_CAZyID2subfam_table(args): + CAZyID2subfam = {} + map_table = f"{args.db}CAZyID_subfam_mapping.tsv" + map_table_lines = open(map_table).readlines() + for line in map_table_lines: + lines = line.rstrip("\n").split("\t") + CAZyID2subfam.setdefault(lines[-1],[]).append(lines[0]) + return CAZyID2subfam + +def read_subfam2ECosub_table(args): + subfam2EC = {};subfam2subtrate = {} + map_table = f"{args.db}subfam_EC_mapping.tsv" + map_table_lines = open(map_table).readlines() + for line in map_table_lines: + 
lines = line.rstrip("\n").split("\t") + if lines[-1] != "-": + substrates = [sub.strip() for sub in lines[-1].strip().replace("and","").split(",")] + subfam2subtrate.setdefault(lines[0],[]).extend(substrates) + if lines[1] != "-": + subfam2EC.setdefault(lines[0],[]).append(lines[1]) + + for subfam in subfam2EC: + subfam2EC[subfam] = list(set(subfam2EC[subfam])) + for subfam in subfam2subtrate: + subfam2subtrate[subfam] = list(set(subfam2subtrate[subfam])) + + ### dict, sub -> [] + return subfam2EC,subfam2subtrate + + +def diamond_EC_abund(args): + if not args.db.endswith("/"): + args.db += "/" + subfam2EC,subfam2subtrate = read_subfam2ECosub_table(args) + + EC2Abund = {} ; EC2subfam = {} + for line in open(args.input): + subfam,FPKM,ReadCount,SeqNum = line.rstrip("\n").split("\t") + if subfam in subfam2EC: + ECs = subfam2EC[subfam] + for EC in ECs: + subfams = EC2subfam.get(EC,[]) + if subfam not in subfams: + EC2subfam.setdefault(EC,[]).append(subfam) + EC2Abund.setdefault(EC,[]).append(float(FPKM)) + + outfilename = args.output + with open(outfilename,'w') as f: + f.write("EC\tAbundance\tsubfam\n") + for sub in EC2Abund: + f.write(sub+"\t"+str(sum(EC2Abund[sub]))+"\t"+";".join(EC2subfam[sub])+"\n") + +def CAZyme_substrate(args): + if not args.db.endswith("/"): + args.db += "/" + + EC2substrate = read_EC2substrate_table(args) + subfam2EC,subfam2subtrate = read_subfam2ECosub_table(args) + + Sub2Abund = {}; Sub2subfam = {} + for line in open(args.input): + #subfam,FPKM,ReadCount,SeqNum = line.rstrip("\n").split("\t") + #Subfamily Abundance SeqNum ReadCount + subfam,FPKM,SeqNum,ReadCount = line.rstrip("\n").split("\t") + ### route1, subfam->EC->substrate + if subfam in subfam2EC: + ECs = subfam2EC[subfam] + if ECs: + for EC in ECs: + substrates = EC2substrate.get(EC,"") + if substrates: + for sub in substrates: + subfams = Sub2subfam.get(sub,[]) + if subfam not in subfams: + Sub2Abund.setdefault(sub,[]).append(float(FPKM)) + Sub2subfam.setdefault(sub,[]).append(subfam) + ### route2, subfam -> substrate + substrates = subfam2subtrate.get(subfam,"") + if substrates: + for sub in substrates: + subfams = Sub2subfam.get(sub,[]) + if subfam not in subfams: + Sub2Abund.setdefault(sub,[]).append(float(FPKM)) + Sub2subfam.setdefault(sub,[]).append(subfam) + + outfilename = args.output + with open(outfilename,'w') as f: + f.write("Substrate\tAbundance\tsubfam\n") + for sub in Sub2Abund: + f.write(sub+"\t"+str(sum(Sub2Abund[sub]))+"\t"+";".join(Sub2subfam[sub])+"\n") + +def Cal_subfam_FPKM(paf1,paf2,totalreadnumber,normalized): + ## get sequence length from paf + seq2len = getSeqlen(paf1,paf2) + + # get CAZy family to seq mapping table: CAZy ID 2 protein ID + #cazy2seqid = getCazySeqId(paf1,paf2) + subfam2seqid = get_subfam2seqid(paf1,paf2) + # outdict_list(cazy2seqid) + + ## get SeqID2ReadID to generate mapping table: protein ID 2 read ID + seqid2readid = getSeqReadID(paf1,paf2) + ### read table: protein ID 2 read count + readtable = SeqReadCount(seqid2readid) + ## outdict(readtable) + ## calculate fpkm for each protein seq + if normalized == "FPKM": + seqfpkm = SequenceFPKM(readtable,seq2len,totalreadnumber) + elif normalized == "RPM": + seqfpkm = SequenceRPM(readtable,seq2len,totalreadnumber) + else: + seqfpkm = SequenceTPM(readtable,seq2len,totalreadnumber) + ## outdict(seqfpkm) + cazyfpkm = CAZyFPKM(seqfpkm,subfam2seqid) + #outdict(cazyfpkm) + return cazyfpkm,readtable,subfam2seqid + + +def diamond_subfam_abund(args): + if not args.db.endswith("/"): + args.db += "/" + 
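### Illustration only, referring to CAZyme_substrate above: a toy walk-through of
### the two routes that credit a subfamily's abundance to a substrate. With
###     subfam2EC       = {"GH95_e1": ["3.2.1.63"]}
###     EC2substrate    = {"3.2.1.63": ["xyloglucan"]}
###     subfam2subtrate = {"GH95_e1": ["xyloglucan", "mucin"]}
### route 1 (subfam -> EC -> substrate) credits xyloglucan with GH95_e1's abundance,
### route 2 (subfam -> substrate) adds mucin, and the Sub2subfam bookkeeping keeps
### xyloglucan from being counted twice for the same subfamily.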
check_read_type(args.raw_reads)
+    if not args.paf2: ### paired-end only for now, same as diamond_unassemble_data
+        sys.stderr.write(HLError("Error") + " Single-end input is not supported yet, please provide -paf2.\n")
+        exit(1)
+    ### FPKM or TPM is based on args.normalized
+    CAZyID2subfam = read_CAZyID2subfam_table(args)
+    paf1 = Paf(args.paf1)
+    paf2 = Paf(args.paf2)
+    paf1.Assign_subfam(CAZyID2subfam)
+    paf2.Assign_subfam(CAZyID2subfam)
+    totalreadnumber = float(get_count_reads(args.raw_reads))*2 ### R1 and R2 have the same read count
+
+    subfamfpkm,readtable,subfam2seqid = Cal_subfam_FPKM(paf1,paf2,totalreadnumber,args.normalized)
+    FPKMToCsv(args,"Diamond",subfamfpkm,readtable,subfam2seqid)
+
+def arg_parse():
+    parser = argparse.ArgumentParser(description='diamond assembly-free method')
+    parser.add_argument('function', help='which function will be used.',choices=["diamond_fam_abund","diamond_substrate_abund","diamond_subfam_abund","diamond_EC_abund"])
+    parser.add_argument('-paf1', type=str, help='R1 reads diamond blastx',default="")
+    parser.add_argument('-paf2', type=str, help='R2 reads diamond blastx',default="")
+    parser.add_argument('-i','--input', type=str)
+    parser.add_argument('-d','--db', type=str,default="./db")
+    parser.add_argument('-o','--output', type=str ,default="asmfree_fam_abund")
+    parser.add_argument('--raw_reads', type=str ,default=" ",help="compressed or uncompressed fq/fa reads file.")
+    parser.add_argument('-n','--normalized', type=str ,help="FPKM, TPM, RPM",default = "TPM",choices=['FPKM', 'RPM', 'TPM'])
+    return parser.parse_args()
+
+def main():
+    args = arg_parse()
+    if args.function == "diamond_fam_abund":
+        ### dbcan_asmfree diamond_fam_abund -paf1 Dry2014_1.blastx -paf2 Dry2014_2.blastx --raw_reads Dry2014_1_val_1.fq.gz -n FPKM -o Dry2014_fam_abund
+        ### dbcan_asmfree diamond_fam_abund -paf1 Wet2014_1.blastx -paf2 Wet2014_2.blastx --raw_reads Wet2014_1_val_1.fq.gz -n FPKM -o Wet2014_fam_abund
+        diamond_unassemble_data(args)
+    if args.function == "diamond_subfam_abund":
+        ### dbcan_asmfree diamond_subfam_abund -paf1 Dry2014_1.blastx -paf2 Dry2014_2.blastx --raw_reads Dry2014_1_val_1.fq.gz -o Dry2014_subfam_abund -n FPKM
+        ### dbcan_asmfree diamond_subfam_abund -paf1 Wet2014_1.blastx -paf2 Wet2014_2.blastx --raw_reads Wet2014_1_val_1.fq.gz -o Wet2014_subfam_abund -n FPKM
+        diamond_subfam_abund(args)
+    if args.function == "diamond_EC_abund":
+        ### dbcan_asmfree diamond_EC_abund -i Dry2014_subfam_abund -o Dry2014_EC_abund
+        ### dbcan_asmfree diamond_EC_abund -i Wet2014_subfam_abund -o Wet2014_EC_abund
+        diamond_EC_abund(args)
+    if args.function == "diamond_substrate_abund":
+        ### dbcan_asmfree diamond_substrate_abund -i Dry2014_subfam_abund -o Dry2014_substrate_abund
+        ### dbcan_asmfree diamond_substrate_abund -i Wet2014_subfam_abund -o Wet2014_substrate_abund
+        CAZyme_substrate(args)
+if __name__== "__main__":
+    main()
diff --git a/dbcan/utils/plots.py b/dbcan/utils/plots.py
new file mode 100644
index 000000000..422a512cf
--- /dev/null
+++ b/dbcan/utils/plots.py
@@ -0,0 +1,717 @@
+import time
+from subprocess import Popen, call, check_output
+import argparse,os
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib.path import Path
+from matplotlib.patches import PathPatch
+import matplotlib.patches as patches
+from matplotlib.patches import FancyArrowPatch
+from matplotlib.patches import Polygon
+from matplotlib.lines import Line2D
+import seaborn as sns
+from matplotlib import pyplot
+from matplotlib.patches import Patch
+#plt.style.use('seaborn')
+from dbcan.utils.utils import cgc_standard_line
+from dbcan_cli.syntenic_plot import
syntenic_plot,read_blast_result_cgc,read_UHGG_CGC_stanrdard_out,read_PUL_cgcgff +from dbcan_cli.syntenic_plot import Get_parameters_for_plot,plot_Polygon_homologous,plot_syntenic_block +from dbcan_cli.syntenic_plot import Get_Position as synGet_Position +from dbcan_cli.syntenic_plot import plot_genome_line as synplot_genome_line +import matplotlib as mpl +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['ps.fonttype'] = 42 +import matplotlib.colors as colors + +class CGC_Standard_Out(object): + def __init__(self,filename): + hits = open(filename).readlines()[1:] + self.genes = [] + for line in hits: + if line.startswith("CGC#"): + continue + lines = line.split() + self.genes.append(cgc_standard_line(lines)) + def __iter__(self): + return iter(self.genes) + + def CGCID2genes(self): + cgcdict = {} + for gene in self: + cgcdict.setdefault(gene.cgcid,[]).append(gene) + return cgcdict + +class CGC(object): + def __init__(self,genes): + self.genes = genes + self.ID = genes[0].cgcid ### get cgc id + self.start = min([gene.gene_start for gene in genes]) + self.end = max([gene.gene_end for gene in genes]) + self.gene_num = len(genes) + + def __iter__(self): + return iter(self.genes) + def __repr__(self): + return "\t".join([self.ID,str(self.start),str(self.end),str(self.gene_num)]) + def __len__(self): + return len(self.genes) + + def get_positions(self): + starts = [] ; ends = [] ; strands = [] + for gene in self: + starts.append(gene.gene_start) + ends.append(gene.gene_end) + strands.append(gene.strand) + return starts,ends,strands + + def get_proteinID(self): + return [gene.seqid for gene in self] + def get_cgc_CAZyme(self): + return [gene.gene_type for gene in self] + +class CGC_standard_out_2CGC(object): + def __init__(self,dbcan): + self.CGCs = [] + cgcdict = dbcan.CGCID2genes() + for cgc in cgcdict: + self.CGCs.append(CGC(cgcdict[cgc])) + def __iter__(self): + return iter(self.CGCs) + + def cgcid2CGC(self): + return {cgc.ID:cgc for cgc in self} + +def CGC_plot(args): + paras = plot_parameters(args) + dbCAN_standard_out = CGC_Standard_Out(paras.PUL_annotation) + cgcs = CGC_standard_out_2CGC(dbCAN_standard_out) + cgcid2cgc = cgcs.cgcid2CGC() + cgc = cgcid2cgc[args.cgcid] + genetypes = cgc.get_proteinID() + ## get gene starts + starts,ends,strands = cgc.get_positions() + types = cgc.get_cgc_CAZyme() + print(f"{args.cgcid.split('|')[0]}:{min(starts)}-{max(ends)}") + cgc_fig_plot(starts,ends,strands,types,genetypes) + +def read_location_reads_count(filename): + xs2ys = {} + for line in open(filename): + lines = line.split() + xs2ys[int(lines[1])] = int(lines[2]) + return xs2ys + +def CGC_plot_reads_count(args): + paras = plot_parameters(args) + dbCAN_standard_out = CGC_Standard_Out(paras.PUL_annotation) + cgcs = CGC_standard_out_2CGC(dbCAN_standard_out) + cgcid2cgc = cgcs.cgcid2CGC() + cgc = cgcid2cgc[args.cgcid] + genetypes = cgc.get_proteinID() + ## get gene starts + starts,ends,strands = cgc.get_positions() + types = cgc.get_cgc_CAZyme() + cgc_fig_plot_abund(starts,ends,strands,types,genetypes,paras) + +def Get_Position(starts,ends,strands,labels,yshift=0): + Width = 1000 ; Height = 160; + poly_heigth = 10 + Triangle_length = 4 + plot_start_x, plot_start_y = [0,Height/2 - poly_heigth-yshift] + shfit_pos = starts[0] + maxbp = max(ends) - min(starts) + pixeachbp = Width / maxbp + for i in range(len(starts)): + starts[i] = starts[i] - shfit_pos + ends[i] = ends[i] - shfit_pos + #maxbp = max(ends) - min(starts) + ### 5 4 + ### 3 + ### 1 2 + lines = [] ;polygens = [];texts = [] + for i in 
range(len(starts)): + if strands[i] == "+": + positions_str = str( starts[i] * pixeachbp) + " " + str(plot_start_y) + " " ## first point x,y + positions_str += str( ends[i] * pixeachbp - Triangle_length) + " " + str(plot_start_y) + " "## second point + positions_str += str( ends[i] * pixeachbp) + " " + str(plot_start_y + poly_heigth) + " " ## 3 + positions_str += str( ends[i] * pixeachbp - Triangle_length) + " " + str( plot_start_y + 2*poly_heigth) + " " ### 4 + positions_str += str( starts[i] * pixeachbp )+ " " + str(plot_start_y + 2*poly_heigth) + if strands[i] == "-": + positions_str = str( starts[i] * pixeachbp ) + " " + str(plot_start_y + poly_heigth) + " " + positions_str += str( starts[i] * pixeachbp + Triangle_length) + " " + str(plot_start_y) + " " + positions_str += str(ends[i] * pixeachbp) + " " + str(plot_start_y) + " " + positions_str += str( ends[i] *pixeachbp ) + " " + str(plot_start_y + 2* poly_heigth) + " " + positions_str += str( starts[i]* pixeachbp +Triangle_length) + " " + str(plot_start_y + 2* poly_heigth) + polygens.append(positions_str) + ### for genome line + if i < len(starts) -1: + positions_str = str( ends[i] *pixeachbp) + " " + str(plot_start_y + poly_heigth) + " " + positions_str += str( starts[i+1]*pixeachbp) + " " + str(plot_start_y + poly_heigth) + lines.append(positions_str) + texts.append(labels[i].split('.')[0]) + + scale_number = 10 + each_scale_bp = maxbp / scale_number + each_scale_pix = each_scale_bp * pixeachbp + + plot_start_y -= 50 + scale_positions = []; scale_positions_texts = [] ; scale_text = [] + scale_positions.append("0 " + str(plot_start_y + 3*poly_heigth) + " " + str(10*each_scale_pix) + " " + str(plot_start_y + 3*poly_heigth)) + plot_start_y -= 1 + for i in range(scale_number+1): + positions_str = str(i*each_scale_pix) + " " + positions_str += str(plot_start_y + 3* poly_heigth) + " " + positions_str += str(i*each_scale_pix) + " " + positions_str += str(plot_start_y + 3*poly_heigth + 0.6* poly_heigth) + scale_positions.append(positions_str) + positions_str = str(i*each_scale_pix) + " " + str(plot_start_y + 3*poly_heigth + 0.6* poly_heigth) + scale_positions_texts.append(positions_str) + scale_text.append(str(int(each_scale_bp*i)+ shfit_pos)) + + return polygens,lines,texts,scale_positions,scale_text + +def plot_Polygon(polygens1,types1,ax): + colors_map = {"CAZyme":"#FF0000","null":"#808080","other":"#808080", + "TC":"#9400D3","CDS":"#00FFFF","STP":"#0000FF","TF":"#1E90FF"} + for j in range(len(polygens1)): + polygen = polygens1[j].split() + points = [] + color = colors_map[types1[j]] + for i in range(int(len(polygen)/2)): + points.append([float(polygen[2*i]),float(polygen[2*i+1])]) + ax.add_patch(Polygon(points, color=color, alpha=0.5,edgecolor=None,facecolor=None,lw=0)) + +def plot_genome_line(lines,ax): + for line in lines: + x1,y1,x2,y2 = points2(line) + ax.add_patch(Polygon([(x1,y1),(x2,y2)], color="gray",lw=1,edgecolor=None)) + +def plot_scale_line(lines,label,ax): + for i,line in enumerate(lines): + x1,y1,x2,y2 = points2(line) + ax.add_patch(Polygon([(x1,y1),(x2,y2)], color="gray",lw=1,edgecolor=None)) + if i>=1: + ax.text(float(x1),float(y1)-20,label[i-1],va='bottom', ha='center') + +def points2(coord): + x1,y1,x2,y2 = coord.split() + return x1,y1,x2,y2 + +def cgc_fig_plot(starts,ends,strands,types,labels): + custom_lines = [Line2D([0], [0], color="red", lw=4,alpha=0.5), + Line2D([0], [0], color="blue", lw=4,alpha=0.5), + Line2D([0], [0], color="green", lw=4,alpha=0.5), + Line2D([0], [0], color="cyan", lw=4,alpha=0.5), + 
Line2D([0], [0], color="gray", lw=4,alpha=0.5)] + + labelcolor=["red","blue","green","cyan","gray"] + + genecustom_lines = [Patch(color="#FF0000",alpha=0.5), + Patch(color="#808080", alpha=0.5), + Patch(color="#9400D3", alpha=0.5), + Patch(color="#0000FF", alpha=0.5), + Patch(color="#1E90FF", alpha=0.5)] + genelabelcolor=["#FF0000","#808080","#9400D3","#0000FF","#1E90FF"] + geneslabels = ["CAZyme","Other","TC","STP","TF"] + ### for legends + px = 1/plt.rcParams['figure.dpi'] ## px + Width = 1400 ; Height = 100 + fig = plt.figure(figsize=(Width*px*1.2,Height*px*2)) + ax = fig.add_subplot(111) + maxbp = max(ends) - min(starts) + polygens,lines,texts,scale_positions,scale_text = Get_Position(starts,ends,strands,labels) + #print (texts,scale_positions,scale_text) + plot_Polygon(polygens,types,ax) + plot_genome_line(lines,ax) + plot_scale_line(scale_positions,scale_text,ax) + ax.plot() + legend = pyplot.legend(genecustom_lines,geneslabels,frameon=False,labelcolor=genelabelcolor,loc='best',title_fontsize="x-large") + ax.add_artist(legend) + plt.ylim(0,150) + plt.xlim(-50,1100) + plt.tight_layout(pad=0.1) + plt.axis('off') + #plt.show() + file_name = "cgc.pdf" + print(f"Save figure to file {file_name}!") + plt.savefig(f"{file_name}") + plt.close() + +def cgc_fig_plot_abund(starts,ends,strands,types,labels,parameters): + ori_starts = starts.copy(); ori_ends = ends.copy() #### starts will be shift, kept them + custom_lines = [Line2D([0], [0], color="red", lw=4,alpha=0.5), + Line2D([0], [0], color="blue", lw=4,alpha=0.5), + Line2D([0], [0], color="green", lw=4,alpha=0.5), + Line2D([0], [0], color="cyan", lw=4,alpha=0.5), + Line2D([0], [0], color="gray", lw=4,alpha=0.5)] + + labelcolor=["red","blue","green","cyan","gray"] + + genecustom_lines = [Patch(color="#FF0000",alpha=0.5,lw=0), + Patch(color="#808080", alpha=0.5,lw=0), + Patch(color="#9400D3", alpha=0.5,lw=0), + Patch(color="#0000FF", alpha=0.5,lw=0), + Patch(color="#1E90FF", alpha=0.5,lw=0)] + genelabelcolor=["#FF0000","#808080","#9400D3","#0000FF","#1E90FF"] + geneslabels = ["CAZyme","Other","TC","STP","TF"] + ### for legends + px = 1/plt.rcParams['figure.dpi'] ## px + Width = 1400 ; Height = 100 + fig = plt.figure(figsize=(Width*px*1.2,Height*px*4)) + + #plt.subplots_adjust(bottom=-0.5) + ax = fig.add_subplot(212) + maxbp = max(ends) - min(starts) + polygens,lines,texts,scale_positions,scale_text = Get_Position(starts,ends,strands,labels) + #print (texts,scale_positions,scale_text) + plot_Polygon(polygens,types,ax) + plot_genome_line(lines,ax) + plot_scale_line(scale_positions,scale_text,ax) + ax.plot() + legend = pyplot.legend(genecustom_lines,geneslabels,frameon=False,labelcolor=genelabelcolor,loc='best',title_fontsize="x-large") + ax.add_artist(legend) + plt.ylim(0,150) + xlim_x1,xlim_x2 = (-10,1100) + plt.xlim(xlim_x1,xlim_x2) + #plt.tight_layout(pad=0.1) + plt.axis('off') + + ### here we need to plot the reads_count of each position + ### layout 2 + xs2ys = read_location_reads_count(parameters.reads_count) + max_y = max(xs2ys.values()) + add_readcount_layout(fig,ori_starts,ori_ends,xs2ys,max_y,-3,max_y+10,xlim_x1,xlim_x2,maxbp) + #plt.show() + file_name = "cgc-coverage.pdf" + print(f"Save figure to file {file_name}!") + plt.savefig(f"{file_name}") + #plt.close() + +def add_readcount_layout(fig,starts,ends,xs2ys,max_y,ylim_y1,ylim_y2,xlim_x1,xlim_x2,syn_maxbp): + maxbp = max(ends) -min(starts) + Width = 1000 + pixeachbp = Width / syn_maxbp ### here the maxbp should from the whole max + ax = fig.add_subplot(211) + #ax = plt.axes([0, 0.4, 
1, 0.3])
+    plt.ylim(ylim_y1,ylim_y2)
+    plt.xlim(xlim_x1,xlim_x2)
+    plt.tight_layout(pad=0.1)
+    plt.plot((0,1000),(0,0),color='gray',lw=1)
+    all_xs = []; all_ys = []
+    start = min(starts)
+    for i in range(1,maxbp+1):
+        all_xs.append(pixeachbp*i)
+        if i+start in xs2ys:
+            all_ys.append(xs2ys[i+start])
+        else:
+            all_ys.append(0)
+    #print(starts,all_xs[0],all_xs[-1])
+    plt.plot(all_xs,all_ys,'-',alpha=0.5,color='red',lw=1)
+    #ax.fill_between(all_xs,0*len(all_xs),all_ys,facecolor='red', alpha=0.3,edgecolor="white")
+    ax.fill_between(all_xs,all_ys,0,facecolor='red', alpha=0.3,edgecolor="white")
+    for pos in ['top', 'right', 'bottom']:
+        ax.spines[pos].set_visible(False)
+    ax.tick_params(bottom=False, top=False, left=True, right=False)
+    ax.set_xticks([])
+
+class plot_parameters():
+    def __init__(self,args):
+        self.input = args.input if args.input.endswith("/") else args.input +"/"
+        #self.R1 = args.R1
+        #self.R2 = args.R2
+        self.bedtools = args.bedtools
+        self.reads_count = args.readscount
+        self.output = args.function + "_" + args.output
+        self.CAZyme_annotation = self.input + "overview.txt"
+        self.dbCANsub_substrate_annotation = self.input + "dbcan-sub.hmm.out"
+        self.PUL_substrate_annotation = self.input + "substrate.out"
+        self.PUL_annotation = self.input + "cgc_standard.out"
+        self.function = args.function
+        self.parameters_check()
+
+    def parameters_check(self):
+        if self.function == "CGC_plot":
+            print("You are plotting the CGC!")
+            if not os.path.exists(self.PUL_annotation):
+                print(f"PUL annotation file {self.PUL_annotation} does not exist, please check whether run_dbcan finished successfully!")
+                exit(1)
+        if self.function == "CGC_coverage_plot":
+            print("You are plotting the CGC with reads count!")
+            if not os.path.exists(self.PUL_annotation):
+                print(f"PUL annotation file {self.PUL_annotation} does not exist, please check whether run_dbcan finished successfully!")
+                exit(1)
+            if not os.path.exists(self.reads_count):
+                print(f"Reads count file {self.reads_count} does not exist, please run samtools depth first!")
+                exit(1)
+
+        if self.function == "CGC_synteny_plot":
+            self.blastp = self.input + "PUL_blast.out"
+        if self.function == "CGC_synteny_coverage_plot":
+            self.blastp = self.input + "PUL_blast.out"
+
+def generate_syntenic_block(cgcpul,cgcpul_blastp,genes1,genes2):
+    blocks = []
+    for record in cgcpul_blastp[cgcpul]: ### generate block information
+        query = record.qseqid
+        hit = record.sseqid
+        cgc_proteinid = query.split("|")[2]
+        pul_proteinid = hit.split(":")[3]
+        if not pul_proteinid:
+            pul_proteinid = hit.split(":")[2]
+        try:
+            index1 = genes1.index(cgc_proteinid)
+            index2 = genes2.index(pul_proteinid)
+            blocks.append(f"{index1}-{index2}-{record.pident}")
+        except ValueError: ### protein id missing from the CGC or PUL gene list
+            print (cgcpul,query,hit,cgc_proteinid,pul_proteinid,genes1,genes2)
+            continue
+    return blocks
+    #print (cgc_proteinid2gene[cgc_proteinid],pul_proteinid2gene[pul_proteinid])
+
+def CGC_syntenic_with_PUL(args):
+    paras = plot_parameters(args)
+    cgcid2pulid = {line.rstrip().split("\t")[0]:line.rstrip().split("\t")[1] for line in open(paras.PUL_substrate_annotation).readlines()[1:]}
+    cgc = args.cgcid
+    pul = cgcid2pulid.get(cgc,"")
+    if pul:
+        cgcpul_blastp = read_blast_result_cgc(paras.blastp)
+        cgc_proteinid2gene,cgcid2gene,cgcid2geneid = read_UHGG_CGC_stanrdard_out(paras.PUL_annotation)
+        PULid_proteinid2gene,PULid2gene,PULid2geneid = read_PUL_cgcgff(args) ### read db_dir
+        cgcpul = cgc+":"+pul
+        bed_cgc = cgcid2gene[cgc]
+        bed_pul = PULid2gene[pul]
+        starts1,ends1,strands1,types1 = Get_parameters_for_plot(bed_cgc)
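### Illustration only: generate_syntenic_block above returns "index1-index2-pident"
### strings pairing a CGC gene with a PUL gene. With toy gene lists
###     genes1 = ["prot_A", "prot_B", "prot_C"]   # query CGC gene order
###     genes2 = ["PUL0538_01", "PUL0538_02"]     # matched PUL gene order
### and hits (prot_A, PUL0538_01, 92.3) and (prot_C, PUL0538_02, 78.0), it yields
### ['0-0-92.3', '2-1-78.0']; syntenic_plot then shades a polygon between each
### paired gene, colored by the identity bins shown in the figure legend.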
starts2,ends2,strands2,types2 = Get_parameters_for_plot(bed_pul) + genes1 = cgcid2geneid[cgc] + genes2 = PULid2geneid[pul] + #print (cgc,pul) + #print (genes1) + #print (genes2) + blocks = generate_syntenic_block(cgcpul,cgcpul_blastp,genes1,genes2) + syntenic_plot(starts1,starts2,ends1,ends2,strands1,strands2,types1,types2,blocks,cgc,pul) + else: + print(f"Does not find homolog PUL for CGC: {cgc}!") + exit(1) + +def CGC_syntenic_with_PUL_abund(args): + paras = plot_parameters(args) + cgcid2pulid = {line.rstrip().split("\t")[0]:line.rstrip().split("\t")[1] for line in open(paras.PUL_substrate_annotation).readlines()[1:]} + cgc = args.cgcid + pul = cgcid2pulid.get(cgc,"") + if pul: + cgcpul_blastp = read_blast_result_cgc(paras.blastp) + cgc_proteinid2gene,cgcid2gene,cgcid2geneid = read_UHGG_CGC_stanrdard_out(paras.PUL_annotation) + PULid_proteinid2gene,PULid2gene,PULid2geneid = read_PUL_cgcgff(args) ### read db_dir + cgcpul = cgc+":"+pul + bed_cgc = cgcid2gene[cgc] + bed_pul = PULid2gene[pul] + starts1,ends1,strands1,types1 = Get_parameters_for_plot(bed_cgc) + starts2,ends2,strands2,types2 = Get_parameters_for_plot(bed_pul) + genes1 = cgcid2geneid[cgc] + genes2 = PULid2geneid[pul] + #print (cgc,pul) + #print (genes1) + #print (genes2) + blocks = generate_syntenic_block(cgcpul,cgcpul_blastp,genes1,genes2) + syntenic_plot_with_abund(starts1,starts2,ends1,ends2,strands1,strands2,types1,types2,blocks,cgc,pul,paras) + else: + print(f"Does not find homolog PUL for CGC: {cgc}!") + exit(1) + +def syntenic_plot_with_abund(starts,starts1,ends,ends1,strands,strands1,Types,Types1,blocks,cgcid,pulid,paras): + ### for legends + custom_lines = [Line2D([0], [0], color="red", lw=4,alpha=0.5), + Line2D([0], [0], color="blue", lw=4,alpha=0.5), + Line2D([0], [0], color="green", lw=4,alpha=0.5), + Line2D([0], [0], color="cyan", lw=4,alpha=0.5), + Line2D([0], [0], color="gray", lw=4,alpha=0.5)] + #print(starts,starts1,ends,ends1,strands,strands1,Types,Types1,blocks,cgcid,pulid) + ### syntenic block colors + labelcolor=["red","blue","green","cyan"] + labels = ["80-100","60-80","40-60","20-40"] + genecustom_lines = [Patch(color="#FF0000",alpha=0.5), + Patch(color="#808080", alpha=0.5), + Patch(color="#9400D3", alpha=0.5), + Patch(color="#0000FF", alpha=0.5), + Patch(color="#1E90FF", alpha=0.5)] + genelabelcolor=["#FF0000","#808080","#9400D3","#0000FF","#1E90FF"] + geneslabels = ["CAZyme","Other","TC","STP","TF"] + + ### for legends + + px = 1/plt.rcParams['figure.dpi'] ## px + Width = 1600 ; Height = 320*2 + + fig = plt.figure(figsize=(Width*px,Height*px*2/2.5)) + ax = fig.add_subplot(212) + ### decide which + maxbp = max([max(ends) - min(starts),max(ends1) - min(starts1)]) + + ori_starts = starts.copy(); ori_ends = ends.copy() #### starts will be shift, kept them + + ### get postion for all elements of CGC + polygens,blocks_coor,lines_coor,scale_positions,scale_text = synGet_Position(starts,ends,strands,maxbp,yshift=40,up=2) ## CGC + plot_scale_line(scale_positions,scale_text,ax) + + polygens1,blocks1_coor,lines_coor1,_,_ = synGet_Position(starts1,ends1,strands1,maxbp,yshift=0,up=1) ### PUL + ### + plot_Polygon_homologous(polygens,polygens1,Types,Types1,2,ax) + ### + plot_syntenic_block(blocks,blocks_coor,blocks1_coor,ax) + synplot_genome_line(lines_coor,lines_coor1,ax) + ### need to add the genome postion scale + ### legend1 + legend1 = pyplot.legend(custom_lines,labels,frameon=False,labelcolor=labelcolor, + loc='upper right',title="Identity") + ax.add_artist(legend1) + + legend2 = 
pyplot.legend(genecustom_lines,geneslabels,frameon=False, + labelcolor=genelabelcolor,loc='lower right',title="Gene") + ax.add_artist(legend2) + + plt.text(500,10,cgcid,fontsize=10,horizontalalignment='center') + plt.text(500,90,pulid,fontsize=10,horizontalalignment='center') + xlim_x1,xlim_x2 = (-10,1100) + ylim_y1,ylim_y2 = (0,100) + plt.ylim(ylim_y1,ylim_y2) + plt.xlim(xlim_x1,xlim_x2) + plt.axis('off') + ax.plot() + #plt.tight_layout(pad=0.1) + cgcid = cgcid.replace("|","_") ### need to replace "|" to "_", because | is a special chara for system + ### for local + xs2ys = read_location_reads_count(paras.reads_count) + max_y = max(xs2ys.values()) + add_readcount_layout(fig,ori_starts,ori_ends,xs2ys,max_y,ylim_y1,max_y,xlim_x1,xlim_x2,maxbp) + #plt.show();exit() + print(f"Saving figure to file {cgcid}-syntenic-cov.pdf!") + plt.savefig(f"{cgcid}-syntenic-cov.pdf") + plt.close() + +def combined_datafram_based_on_first_col(pd_lists,samples): + if len(pd_lists) <= 1: + return pd_lists[0] + else: + col_name = pd_lists[0].columns + on_merge_col = col_name[0] ### + merged_table = pd.merge(pd_lists[0],pd_lists[1],on=[on_merge_col],how="outer") + + for i in range(len(pd_lists)): + ori_names = pd_lists[i].columns + mod_names = [ori_names[0]] + [ori_names[j] +"_"+ samples[i] for j in range(1,len(ori_names))] + pd_lists[i].columns = mod_names + + for i in range(2,len(pd_lists)): + merged_table = pd.merge(merged_table,pd_lists[i],on=[on_merge_col],how="outer") + #print(merged_table.columns) + abundance_col = col_name[1] + merged_table.fillna(0,inplace=True) + merged_table["diff_abs"] = np.abs(merged_table[abundance_col+"_x"] - merged_table[abundance_col+"_y"]) + merged_table["diff"] = merged_table[abundance_col+"_x"] - merged_table[abundance_col+"_y"] + + ### rename columns names + merged_columns = merged_table.columns + rename_columns = [] + abund_index = 0 + for column in merged_columns: + if abundance_col in column: + rename_columns.append(samples[abund_index]) + abund_index += 1 + else: + rename_columns.append(column) + #merged_table.rename(columns={abundance_col+"_x": samples[0], abundance_col+"_y": samples[1]},inplace=True) + merged_table.columns = rename_columns + merged_table.sort_values("diff_abs",inplace=True,ascending=False) + return merged_table,on_merge_col + +def filter_out_enzyme_number(table): + bools = [] + for i in table.iloc[:,0]: ### first col + if i[0].isdigit(): + bools.append(True) + elif i in ["PL0","GH0","GT0","CBM0","AA0","CE0"]: + bools.append(False) + else: + bools.append(True) + table = table[bools] + #print (table) + return table + +import re +def add_column_type(table): ### Like + cols = [] + for i in table["CAZy"]: + fam = re.sub(r'[0-9]+', '', i) + cols.append(fam) + #print (i,fam) + table["fam"] = cols + return table + +def heatmap_plot(args): + pds = [filter_out_enzyme_number(pd.read_csv(filename,sep="\t")) for filename in args.input.split(",")] + samples = args.samples.split(",") + plt.style.use(args.plot_style) + if len(pds) != len(samples): + print("The number of samples is not eaqul the abundance!") + exit(1) + for i in range(len(pds)): + pds[i]["sample"] = samples[i] + data,x = combined_datafram_based_on_first_col(pds,samples) + + if not args.col: + data = data.iloc[0:int(args.top),:] + else: + if args.value: + data = data.loc[data[args.col].isin(args.value.split(","))] + else: + data = data.iloc[0:int(args.top),:] + data = data.set_index(data.iloc[:,0]) + data = data[samples] + sns.set_style("whitegrid") + sns.set_context("paper") + ### user defined color map + 
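### Illustration only, referring to combined_datafram_based_on_first_col above:
### the merge is an outer join on the first column, NaN filled with 0, then a
### sort by |sample1 - sample2|. With toy tables
###     a: Family = [GH5, GT2], Abundance = [10, 3]
###     b: Family = [GH5, CE1], Abundance = [2, 7]
### the rows sort as GH5 (diff_abs 8) > CE1 (7) > GT2 (3), so GH5 survives first
### when --top truncates the merged table for the heatmap or barplot.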
+ ### default color + if args.palette: + cmap = args.palette + else: + mycolor=['aliceblue','skyblue','deepskyblue','orange','tomato','red'] + cmap = colors.LinearSegmentedColormap.from_list('my_list', mycolor) + cmap.set_under('white') + + if args.cluster_map: + sns.clustermap(data, cmap=cmap,cbar=True,vmin=0.1,dendrogram_ratio=0.03,cbar_pos=(0.1, 1, 0.1, 0.1), + col_cluster=False,cbar_kws={"shrink": 0.3}, + figsize=(len(data.columns)*1.2,len(data.index)/3)) + #plt.tight_layout(pad=0.1) + plt.savefig("heatmap_cluster.pdf") + else: + plt.figure(figsize=(len(data.columns)*1.2,len(data.index)/4)) + if args.show_abund: + ax = sns.heatmap(data, cmap=cmap,yticklabels=True,annot=True,fmt=".0f",linewidths=.5,cbar=True,vmin=0.1, + cbar_kws={"shrink": 0.3,"anchor":(0, 0.0)}) + else: + ax = sns.heatmap(data, cmap=cmap,yticklabels=True,annot=args.show_abund,fmt=".0f",linewidths=0,cbar=True,vmin=0.1, + cbar_kws={"shrink": 0.3,"anchor":(0, 0.0)}) + ax.collections[0].colorbar.ax.tick_params(labelsize=6) + + plt.xticks(rotation=30) + plt.tight_layout(pad=0.1) + #plt.show() + plt.savefig("heatmap.pdf") + +def bar_plot(args): + ### --input CAZyme_abund_output,CAZyme_abund_output + ### --samples fefifo_8002_1,fefifo_8002_7 + ### show top10? or most different? + pds = [filter_out_enzyme_number(pd.read_csv(filename,sep="\t")) for filename in args.input.split(",")] + samples = args.samples.split(",") + plt.style.use(args.plot_style) + if len(pds) != len(samples): + print("The number of samples is not eaqul the abundance!") + exit(1) + for i in range(len(pds)): + pds[i]["sample"] = samples[i] + data,x = combined_datafram_based_on_first_col(pds,samples) + ### top different abundance --value 'host glycan,starch' --col Substrate + if not args.col: + data = data.iloc[0:int(args.top),:] + else: + if args.value: + data = data.loc[data[args.col].isin(args.value.split(","))] + else: + data = data.iloc[0:int(args.top),:] + ### normal + if args.vertical_bar: + ax = data.plot.barh(x=x, y=samples) + plt.ylabel("") + plt.xlabel("Abundance") + else: + ax = data.plot(x=x, y=samples, kind="bar") + plt.xticks(rotation=90) + plt.xlabel("") + plt.ylabel("Abundance") + + #plt.gca().invert_yaxis() + #plt.gca().invert_xaxis() + ### switch + + #leg = plt.legend(frameon=False,handletextpad=-2.0, handlelength=0) + #for item in leg.legendHandles: + # item.set_visible(False) + #axins = inset_axes(ax, "30%", "30%" ,loc="upper center", borderpad=1) + + plt.title(f"The most top{args.top} different families") + #plt.show() + plt.savefig("bar_plot.pdf") + print("Saving plot to file: bar_plot.pdf") + +def parse_argv(): + usage = ''' + %(prog)s [positional arguments] [options] + ----------------------------------------- + Plot CGC + dbcan_plot CGC_plot -i ../fefifo_8002_1.dbCAN --cgcid 'k141_145331|CGC1' + ----------------------------------------- + Plot CGC with abundance + dbcan_plot CGC_coverage_plot -i ../fefifo_8002_1.dbCAN --cgcid 'k141_145331|CGC1' --readscount cgc.depth.txt + ----------------------------------------- + Plot syntenic blocks between CGC and dbCAN-PUL + dbcan_plot CGC_synteny_plot -i ../fefifo_8002_1.dbCAN --cgcid 'k141_145331|CGC1' + ----------------------------------------- + Plot syntenic blocks between CGC and dbCAN-PUL with abundance + dbcan_plot CGC_synteny_coverage_plot -i ../fefifo_8002_1.dbCAN --cgcid 'k141_145331|CGC1' --readscount cgc.depth.txt + ----------------------------------------- + barplot of abundance for CAZyme,substate across samples + dbcan_plot bar_plot -i 
../fefifo_8002_1.CAZyme_abund,../fefifo_8002_7.CAZyme_abund --samples fefifo_8002_1,fefifo_8002_7 --top 40 --vertical_bar
+    -----------------------------------------
+    Heatmap of abundance for CAZyme,substrate across samples
+    dbcan_plot heatmap_plot -i ../fefifo_8002_1.CAZyme_abund,../fefifo_8002_7.CAZyme_abund --samples fefifo_8002_1,fefifo_8002_7 --show_abund
+    -----------------------------------------
+    '''
+    parser = argparse.ArgumentParser(description='dbCAN plot utilities.',usage=usage,prog='dbcan_plot')
+    parser.add_argument('function', help='Which function will be used to analyze?')
+    parser.add_argument('-i','--input',help='dbCAN CAZyme annotation output folder.',default="output",required=True)
+    parser.add_argument('-bt','--bedtools',help='bedtools gene reads count results.')
+    #parser.add_argument('-1','--R1',help='R1 reads, support gz compress file')
+    #parser.add_argument('-2','--R2',help='R2 reads, support gz compress file, None for single end sequencing',default=None)
+    parser.add_argument('-o','--output',help='output files',default="output")
+    parser.add_argument('--db_dir', default="db", help='Database directory')
+    parser.add_argument('--cgcid', help='CGC id, consists of contig_ID|cgc_order',type=str,default=None)
+    parser.add_argument('--readscount', help='Read counts file generated by samtools depth!')
+    parser.add_argument('--samples', help='samples separated by ",".')
+    parser.add_argument('--top', help='Plot the top N families or substrates, sorted by absolute difference.',default=20,type=int)
+    parser.add_argument('--plot_style', help='Style for barplot and heatmap.',choices=["ggplot",'seaborn','seaborn-poster'],default="ggplot")
+    parser.add_argument('--vertical_bar', help='vertical bar',action='store_true')
+    parser.add_argument('--show_abund', help='Show abundance in heatmap?',action='store_true')
+    parser.add_argument('--palette', help='palettes or colormaps defined in matplotlib',default=None)
+    parser.add_argument('--cluster_map', action='store_true')
+    parser.add_argument('--col',default=None)
+    parser.add_argument('--value',default=None)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_argv()
+    if args.function == "CGC_plot":
+        ### dbcan_plot CGC_plot -i ../fefifo_8002_1.dbCAN --cgcid 'k141_145331|CGC1'
+        CGC_plot(args)
+    if args.function == "CGC_coverage_plot":
+        ### dbcan_plot CGC_coverage_plot -i ../fefifo_8002_1.dbCAN --cgcid 'k141_145331|CGC1' --readscount cgc.depth.txt
+        CGC_plot_reads_count(args)
+    if args.function == "CGC_synteny_plot":
+        ### dbcan_plot CGC_synteny_plot -i ../fefifo_8002_1.dbCAN --cgcid 'k141_145331|CGC1'
+        CGC_syntenic_with_PUL(args)
+    if args.function == "CGC_synteny_coverage_plot":
+        ### dbcan_plot CGC_synteny_coverage_plot -i ../fefifo_8002_1.dbCAN --cgcid 'k141_145331|CGC1' --readscount cgc.depth.txt
+        CGC_syntenic_with_PUL_abund(args)
+    if args.function == "bar_plot":
+        # dbcan_plot bar_plot -i fefifo_8002_1.CAZymeSub_abund,fefifo_8002_7.CAZymeSub_abund --samples fefifo_8002_1,fefifo_8002_7 --vertical_bar --col Substrate --value 'host glycan,starch,pectin,xylan,glycogen,cellulose,sucrose,mucin,arabinoxylan'
+        bar_plot(args)
+    if args.function == "heatmap_plot":
+        heatmap_plot(args)
+if __name__ =="__main__": ### for test
+    #split_uniInput_dbcansub("uniInput",32,"a","a",10,0.5)
+    pass
diff --git a/dbcan/utils/simplify_cgc.py b/dbcan/utils/simplify_cgc.py
deleted file mode 100644
index d241f4491..000000000
--- a/dbcan/utils/simplify_cgc.py
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/usr/bin/env python3
-######################################################### -# Written by Qiwei Ge in Yin Lab at UNL -# A readable version of cgc -######################################################### - -import os -import sys - -def simplify_output(inFile): - try: - text = open(inFile).readlines() - text = [line.strip() for line in text] - except: - print("fail to read") - exit(-1) - dir = os.path.dirname(inFile) - # os.remove(dir+'cgc.out') - annotation = '' - if '' in text: - text.remove('') - with open(dir + '/cgc_standard.out', 'a') as f: - f.write("CGC#\tGene Type\tContig ID\tProtein ID\tGene Start\tGene Stop\tDirection\tProtein Family\n") - f.close() - for i in range(len(text)): - simplified_line = [] - if '+++++' not in text[i]: - each_line = text[i].split('\t') - simplified_line.append(each_line[4]) - simplified_line.append(each_line[1]) - simplified_line.append(each_line[5]) - simplified_line.append(each_line[8]) - simplified_line.append(each_line[6]) - simplified_line.append(each_line[7]) - simplified_line.append(each_line[9]) - - if '' in each_line: - each_line.remove('') - if 'TC' in each_line[1]: # need fix split order if needed - annotation = each_line[10].split('|')[3].split(';')[0] - simplified_line.append(annotation) - elif 'CAZyme' in each_line[1]: # need fix split order if needed - annotation = each_line[10].split(';')[0].split('=')[1] - simplified_line.append(annotation) - elif 'STP' in each_line[1]: # need fix split order if needed - pre_annotation = each_line[10].split(';')[0].split('=')[1].split('|') - STP_counter = 0 - for i in pre_annotation: #hard code - if 'STP' in i: - STP_counter = STP_counter + 1 - if STP_counter > 1: - pre_annotation = each_line[10].split(';')[0].split('=')[1].split(',') - STP_list = [] - for STP in pre_annotation: - if 'STP' in STP: - STP_list.append(STP.split('|')[1]) - annotation = ('+').join(STP_list) - elif STP_counter == 1: - annotation = each_line[10].split(';')[0].split('=')[1].split('|')[1] - else: - annotation = 'none' - simplified_line.append(annotation) - elif 'TF' in each_line[1]: - pre_annotation = each_line[10].split(';')[0].split('=')[1].split('|') - TF_counter = 0 - for i in pre_annotation: #hard code - if 'DBD-Pfam' in i: - TF_counter = TF_counter + 1 - if TF_counter > 1: - pre_annotation = each_line[10].split(';')[0].split('=')[1].split(',') - TF_list = [] - for TF in pre_annotation: - if 'DBD-Pfam' in TF: - TF_list.append(TF.split('|')[1]) - annotation = ('+').join(TF_list) - elif TF_counter == 1: - annotation = each_line[10].split(';')[0].split('=')[1].split('|')[1] - if 'DBD-SUPERFAMILY' in annotation: - annotation = annotation.split(',')[0] - else: - annotation = 'none' - simplified_line.append(annotation) - elif 'null' in each_line[1]: - simplified_line[3] = each_line[10].split('=')[1] - annotation = 'null' - simplified_line.append(annotation) - else: - annotation = 'empty line' - simplified_line.append(annotation) - simplified_line = '\t'.join(simplified_line) - - with open(dir + "/cgc_standard.out", 'a') as f: - f.write(simplified_line+'\n') - f.close() - else: - pass - - - -# if __name__ == "__main__": -# cgc_file = sys.argv[1] - -# simplify_output(cgc_file) diff --git a/dbcan/utils/utils.py b/dbcan/utils/utils.py new file mode 100644 index 000000000..95d06af43 --- /dev/null +++ b/dbcan/utils/utils.py @@ -0,0 +1,629 @@ +from Bio import SeqIO +from itertools import (takewhile, repeat) +import time +from subprocess import Popen, call, check_output +from dbcan.cli import hmmer_parser +import argparse,os +import numpy as np +''' 
+design some functions and classes for dbCAN tutorial
+
+'''
+
+def fq_file_line_count(file_name):
+    if not file_name.endswith(".gz"): ### not gz files
+        buffer = 1024 * 1024
+        with open(file_name) as f:
+            buf_gen = takewhile(lambda x: x, (f.read(buffer) for _ in repeat(None)))
+            return sum(buf.count('\n') for buf in buf_gen)/4
+    else:
+        r = os.popen("zcat " + file_name + " | echo $((`wc -l`/4))")
+        text = r.read()
+        r.close()
+        return int(text)
+
+def total_mapped_reads_count(file_name):
+    ### sum the total mapped reads count
+    total_mapped_reads = 0
+    for line in open(file_name):
+        lines = line.split()
+        total_mapped_reads += int(lines[-1])
+    return total_mapped_reads
+
+class abund_parameters():
+    def __init__(self,args):
+        self.input = args.input if args.input.endswith("/") else args.input +"/"
+        self.R1 = args.R1
+        self.R2 = args.R2
+        self.bedtools = args.bedtools
+        #self.output = args.function + "_" + args.output
+        self.CAZyme_annotation = self.input + "overview.txt"
+        self.dbCANsub_substrate_annotation = self.input + "dbcan-sub.hmm.out"
+        self.PUL_substrate_annotation = self.input + "substrate.out"
+        self.PUL_annotation = self.input + "cgc_standard.out"
+        self.function = args.function
+        self.parameters_check()
+
+    def parameters_check(self):
+        if self.function == "fam_abund":
+            print("You are estimating the abundance of CAZyme!")
+            self.output = "fam_abund.out"
+            if not os.path.exists(self.CAZyme_annotation):
+                print(f"CAZyme annotation file {self.CAZyme_annotation} does not exist, please check whether run_dbcan finished successfully!")
+                exit(1)
+        if self.function == "fam_substrate_abund":
+            self.output = "fam_substrate_abund.out"
+            print("You are estimating the abundance of Substrate according to dbCAN-sub!")
+            if not os.path.exists(self.dbCANsub_substrate_annotation):
+                print(f"dbCAN-sub annotation file {self.dbCANsub_substrate_annotation} does not exist, please check whether run_dbcan finished successfully!")
+                exit(1)
+        if self.function == "CGC_abund":
+            self.output = "CGC_abund.out"
+            print("You are estimating the abundance of CGC!")
+            if not os.path.exists(self.PUL_annotation):
+                print(f"PUL annotation file {self.PUL_annotation} does not exist, please check whether run_dbcan finished successfully!")
+                exit(1)
+        if self.function == "CGC_substrate_abund":
+            print("You are estimating the abundance of CGC substrate!")
+            if not os.path.exists(self.PUL_substrate_annotation):
+                print(f"CGC substrate prediction file {self.PUL_substrate_annotation} does not exist, please check whether run_dbcan finished successfully!")
+                exit(1)
+            if not os.path.exists(self.PUL_annotation):
+                print(f"CGC annotation file {self.PUL_annotation} does not exist, please check whether run_dbcan finished successfully!")
+                exit(1)
+        if self.R2:
+            self.ngs_pe = True
+            print("Reads are paired end!")
+        else:
+            self.ngs_pe = False
+            print("Reads are single end!")
+        if not os.path.exists(self.bedtools):
+            print(f"Reads count file {self.bedtools} does not exist!")
+            exit(1)
+
+class bedtools_read_count():
+    def __init__(self,lines):
+        self.seqid = lines[0]
+        self.length = int(lines[2])
+        self.read_count = int(lines[3])
+    def __repr__(self):
+        return "\t".join([str(getattr(self, value)) for value in vars(self)])
+
+def ReadBedtoos(filename):
+    lines = open(filename).readlines()
+    seqid2info = {line.split()[0]:bedtools_read_count(line.split()) for line in lines[1:]}
+    normalized_tpm = 0.
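### Illustration only: ReadBedtoos reads the bedtools table as lines[0] = seqid,
### lines[2] = length and lines[3] = read_count (header skipped; the second
### column is not used here). normalized_tpm accumulates sum(read_count/length),
### the TPM denominator used later in Cal_Seq_Abundance. With toy rows
###     geneA: 148 reads / 1935 bp and geneB: 62 reads / 1131 bp
### normalized_tpm = 148/1935 + 62/1131 = 0.1313.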
+ for seqid in seqid2info: + seqid_depth = seqid2info[seqid] + normalized_tpm += seqid_depth.read_count/seqid_depth.length + return seqid2info,normalized_tpm + +#Gene ID EC# HMMER dbCAN_sub DIAMOND #ofTools +#fefifo_8002_1_00016 - GH25(447-633) GH25_e123 GH25 3 +#fefifo_8002_1_00040 3.2.1.-:3|3.2.1.63:2 GH95(1-376) GH95_e1 GH95 3 + +def Is_EC(ec): + return ec[0].isdigit() + +def Clean_Hmmer_sub(pred): + preds = pred.split("+") + clean_preds = [] + for f in preds: + f = f.split("(")[0] + clean_preds.append(f) + #if "_" in f: ### also include subfamily + # f_s = f.split("_")[0] ## get family + # clean_preds.append(f_s) + return list(set(clean_preds)) + +def Clean_diamond(pred): + preds = pred.split("+") + clean_preds = [] + for f in preds: + if Is_EC(f): ### exclude EC + continue + f = f.split("(")[0] + clean_preds.append(f) + #if "_" in f: ### also include subfamily + # f_s = f.split("_")[0] ## get family + # clean_preds.append(f_s) + return list(set(clean_preds)) + +class OverView(): + def __init__(self,lines): + self.seqid = lines[0] + self.ECs = list(set([i.split(":")[0] for i in lines[1].split("|") if i.split(":")[0] != '-'])) + self.hmmer = Clean_Hmmer_sub(lines[2]) if lines[2] != "-" else [] + self.dbcan_sub = Clean_Hmmer_sub(lines[3]) if lines[3] != "-" else [] + self.diamond = Clean_diamond(lines[4]) if lines[4] != "-" else [] + self.justify_final_pred() + def __repr__(self): + outline = [] + if not self.preds: + return "" + for value in vars(self): + attr = getattr(self, value) + if isinstance(attr,list): + outline.append("+".join(attr)) + else: + outline.append(str(attr)) + return "\t".join(outline) + ### dbcan_sub is for substrate prediction + def justify_final_pred(self): + union_fam = list(set(self.hmmer)|set(self.dbcan_sub)) + self.ECs ## union + if union_fam: + self.preds = union_fam + else: + self.preds = [] + +def ReadOverView(filename): + return {line.split()[0]:OverView(line.split()) for line in open(filename).readlines()[1:]} + +#dbCAN subfam Subfam Composition Subfam EC Substrate Profile Length Gene ID Gene Length E Value Profile Start Profile End Gene Start Gene End Coverage +#GH25_e123 GH25:8 - - 181 fefifo_8002_1_00016 644 7.5e-68 1 181 448 634 0.994475138121547 0.994475138121547 + +class DbcanSub_line(): + def __init__(self,lines): + self.dbcan_subfam = lines[0] + self.subfam_comp = lines[1] + self.subfam_EC = lines[2] + self.substrate = lines[3] + self.hmmlen = lines[4] + self.seqid = lines[5] + self.protlen = lines[6] + self.evalue = lines[7] + def __repr__(self): + return "\t".join([str(getattr(self, value)) for value in vars(self)]) + +def Read_dbcansub_out(filename): + return {line.split("\t")[5]:DbcanSub_line(line.rstrip().split("\t")) for line in open(filename).readlines()[1:]} + +#CGC# Gene Type Contig ID Protein ID Gene Start Gene Stop Direction Protein Family +#CGC1 TC k141_70330 fefifo_8002_1_00592 100 1263 - 2.A.1.7.16 + +class cgc_standard_line(): + def __init__(self,lines): + self.cgcid = lines[2] + "|" + lines[0] + self.cgc_order = lines[0] + self.gene_type = lines[1] ### CAZyme + self.contig_id = lines[2] + self.seqid = lines[3] + self.gene_start = int(lines[4]) + self.gene_end = int(lines[5]) + self.strand = lines[6] + self.protfam = lines[7] + def __repr__(self): + return "\t".join([str(getattr(self, value)) for value in vars(self)]) + +def Read_cgc_standard_out(filename): + seqid2records = {};cgcid2records ={} + for line in open(filename).readlines()[1:]: + lines = line.rstrip("\n").split("\t") + cgcid = lines[2] + "|" + lines[0] + tmp_record = 
+class OverView():
+    def __init__(self,lines):
+        self.seqid = lines[0]
+        self.ECs = list(set([i.split(":")[0] for i in lines[1].split("|") if i.split(":")[0] != '-']))
+        self.hmmer = Clean_Hmmer_sub(lines[2]) if lines[2] != "-" else []
+        self.dbcan_sub = Clean_Hmmer_sub(lines[3]) if lines[3] != "-" else []
+        self.diamond = Clean_diamond(lines[4]) if lines[4] != "-" else []
+        self.justify_final_pred()
+    def __repr__(self):
+        outline = []
+        if not self.preds:
+            return ""
+        for value in vars(self):
+            attr = getattr(self, value)
+            if isinstance(attr,list):
+                outline.append("+".join(attr))
+            else:
+                outline.append(str(attr))
+        return "\t".join(outline)
+    ### dbcan_sub is for substrate prediction
+    def justify_final_pred(self):
+        union_fam = list(set(self.hmmer)|set(self.dbcan_sub)) + self.ECs ## union of families plus ECs
+        if union_fam:
+            self.preds = union_fam
+        else:
+            self.preds = []
+
+def ReadOverView(filename):
+    return {line.split()[0]:OverView(line.split()) for line in open(filename).readlines()[1:]}
+
+#dbCAN subfam	Subfam Composition	Subfam EC	Substrate	Profile Length	Gene ID	Gene Length	E Value	Profile Start	Profile End	Gene Start	Gene End	Coverage
+#GH25_e123	GH25:8	-	-	181	fefifo_8002_1_00016	644	7.5e-68	1	181	448	634	0.994475138121547
+
+class DbcanSub_line():
+    def __init__(self,lines):
+        self.dbcan_subfam = lines[0]
+        self.subfam_comp = lines[1]
+        self.subfam_EC = lines[2]
+        self.substrate = lines[3]
+        self.hmmlen = lines[4]
+        self.seqid = lines[5]
+        self.protlen = lines[6]
+        self.evalue = lines[7]
+    def __repr__(self):
+        return "\t".join([str(getattr(self, value)) for value in vars(self)])
+
+def Read_dbcansub_out(filename):
+    return {line.split("\t")[5]:DbcanSub_line(line.rstrip().split("\t")) for line in open(filename).readlines()[1:]}
+
+#CGC#	Gene Type	Contig ID	Protein ID	Gene Start	Gene Stop	Direction	Protein Family
+#CGC1	TC	k141_70330	fefifo_8002_1_00592	100	1263	-	2.A.1.7.16
+
+class cgc_standard_line():
+    def __init__(self,lines):
+        self.cgcid = lines[2] + "|" + lines[0]
+        self.cgc_order = lines[0]
+        self.gene_type = lines[1] ### gene type, e.g. CAZyme/TC/TF/STP
+        self.contig_id = lines[2]
+        self.seqid = lines[3]
+        self.gene_start = int(lines[4])
+        self.gene_end = int(lines[5])
+        self.strand = lines[6]
+        self.protfam = lines[7]
+    def __repr__(self):
+        return "\t".join([str(getattr(self, value)) for value in vars(self)])
+
+def Read_cgc_standard_out(filename):
+    seqid2records = {};cgcid2records = {}
+    for line in open(filename).readlines()[1:]:
+        lines = line.rstrip("\n").split("\t")
+        cgcid = lines[2] + "|" + lines[0]
+        tmp_record = cgc_standard_line(line.rstrip().split("\t"))
+        seqid2records[lines[3]] = tmp_record
+        cgcid2records.setdefault(cgcid,[]).append(tmp_record)
+    return seqid2records,cgcid2records
+
+## the CGC substrate prediction includes two results: homologous search and majority voting
+#cgcid	PULID	dbCAN-PUL substrate	bitscore	signature pairs	dbCAN-sub substrate	dbCAN-sub substrate score
+#k141_145965|CGC1	PUL0538	galactomannan	3075.0	TC-TC;CAZyme-CAZyme;CAZyme-CAZyme;CAZyme-CAZyme;CAZyme-CAZyme;TC-TC
+
+class cgc_substrate():
+    def __init__(self,lines):
+        self.cgcid = lines[0]
+        self.homo_pul = lines[1]
+        self.homo_sub = lines[2]
+        self.bitscore = lines[3]
+        self.signature_pairs = lines[4]
+        self.major_voting_sub = lines[5]
+        self.major_voting_score = lines[6]
+    def __repr__(self):
+        return "\t".join([str(getattr(self, value)) for value in vars(self)])
+
+def Read_cgc_substrate(filename):
+    return {line.split()[0]:cgc_substrate(line.rstrip("\n").split("\t")) for line in open(filename).readlines()[1:]}
+
+def get_length_readcount(seqid2dbcan_annotation,seqid2readcount):
+    for seqid in seqid2dbcan_annotation:
+        read_count = seqid2readcount.get(seqid,"")
+        if not read_count:
+            print(f"Cannot find read count information for sequence: {seqid}")
+            exit(1)
+        seqid_annotation = seqid2dbcan_annotation[seqid]
+        seqid_annotation.length = read_count.length
+        seqid_annotation.read_count = read_count.read_count
+
+class CAZyme_Abundance_estimate():
+    def __init__(self,parameters):
+        self.pars = parameters
+        #print("Counting reads!")
+        #self.fq_reads_count = fq_file_line_count(self.pars.R1)
+        self.fq_reads_count = total_mapped_reads_count(self.pars.bedtools)
+        print(f"Total mapped reads count: {self.fq_reads_count}!")
+        seqid2readcount,normalized_tpm = ReadBedtools(parameters.bedtools)
+        self.normalized_tpm = normalized_tpm
+        ### read overview.txt for CAZyme family abundance
+        if parameters.function == "fam_abund":
+            seqid2dbcan_annotation = ReadOverView(parameters.CAZyme_annotation)
+        ### read dbcan-sub.hmm.out for substrate abundance
+        if parameters.function == "fam_substrate_abund":
+            seqid2dbcan_annotation = Read_dbcansub_out(parameters.dbCANsub_substrate_annotation)
+        ### read cgc_standard.out
+        if parameters.function == "CGC_abund": ### seqid = cgcid
+            seqid2dbcan_annotation,cgcid2cgc_standard = Read_cgc_standard_out(parameters.PUL_annotation)
+            self.cgcid2cgc_standard = cgcid2cgc_standard
+        ### read cgc_standard.out and substrate.out
+        if parameters.function == "CGC_substrate_abund": ### seqid = cgcid
+            seqid2dbcan_annotation,cgcid2cgc_standard = Read_cgc_standard_out(parameters.PUL_annotation)
+            cgcid2cgc_substrate = Read_cgc_substrate(parameters.PUL_substrate_annotation)
+            self.cgcid2cgc_standard = cgcid2cgc_standard
+            self.cgcid2cgc_substrate = cgcid2cgc_substrate
+
+        get_length_readcount(seqid2dbcan_annotation,seqid2readcount)
+        self.seqid2dbcan_annotation = seqid2dbcan_annotation
+
+    ###                       reads mapped to the gene
+    ### FPKM = ------------------------------------------------------
+    ###        (total mapped reads / 10^6) x (gene length / 1000)
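+    # Worked example (hypothetical numbers) for the three normalizations
+    # implemented in Cal_Seq_Abundance below, taking a 2,000 bp gene with
+    # 100 mapped reads out of 10,000,000 total mapped reads:
+    #   FPKM = 100 / (10_000_000/1e6) / (2000/1000) = 100 / 10 / 2 = 5.0
+    #   RPM  = 100 / (10_000_000/1e6)               = 10.0
+    #   TPM  = (100/2000 * 1e6) / sum(read_count/length over all genes)
+    # TPM normalizes the per-base read rate of the gene by the library-wide
+    # rate sum (the normalized_tpm value accumulated in ReadBedtools).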
+    def Cal_Seq_Abundance(self,method="FPKM"):
+        if method == "FPKM":
+            for seqid in self.seqid2dbcan_annotation: ### for each protein
+                annotation = self.seqid2dbcan_annotation[seqid]
+                normalized_total_reads_counts = self.fq_reads_count/pow(10,6)
+                normalized_seq_length = annotation.length/1000.0 ## CDS length in kb
+                annotation.abund = annotation.read_count/normalized_total_reads_counts/normalized_seq_length
+        if method == "RPM":
+            for seqid in self.seqid2dbcan_annotation:
+                annotation = self.seqid2dbcan_annotation[seqid]
+                normalized_total_reads_counts = self.fq_reads_count/pow(10,6)
+                annotation.abund = annotation.read_count/normalized_total_reads_counts
+        if method == "TPM":
+            for seqid in self.seqid2dbcan_annotation:
+                annotation = self.seqid2dbcan_annotation[seqid]
+                normalized_total_reads_counts = annotation.read_count/annotation.length*pow(10,6) ### per-base rate scaled to 10^6
+                annotation.abund = normalized_total_reads_counts/self.normalized_tpm
+
+    def Cal_Family_Abundance(self):
+        family2seqid = {}
+        for seqid in self.seqid2dbcan_annotation:
+            annotation = self.seqid2dbcan_annotation[seqid]
+            if annotation.preds: ### if dbCAN has predictions
+                for family in annotation.preds:
+                    family2seqid.setdefault(family,[]).append(seqid)
+        self.family2seqid = family2seqid
+        family2abund = {familyid:0.0 for familyid in family2seqid}
+        for familyid in family2seqid:
+            for seqid in family2seqid[familyid]:
+                family2abund[familyid] += self.seqid2dbcan_annotation[seqid].abund ### sum over all member sequences
+        self.family2abund = family2abund
+
+    def Cal_Substrate_Abundance(self):
+        substrate2seqid = {}
+        for seqid in self.seqid2dbcan_annotation:
+            annotation = self.seqid2dbcan_annotation[seqid]
+            substrates_tmp = annotation.substrate.replace("and",",").split(",")
+            substrates = list(set([tmp.strip() for tmp in substrates_tmp if tmp != "-" and tmp]))
+            for sub in substrates:
+                substrate2seqid.setdefault(sub,[]).append(seqid)
+
+        substrate2abund = {sub:0.0 for sub in substrate2seqid}
+        for sub in substrate2seqid:
+            for seqid in substrate2seqid[sub]:
+                substrate2abund[sub] += self.seqid2dbcan_annotation[seqid].abund
+        self.substrate2abund = substrate2abund
+        self.substrate2seqid = substrate2seqid
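+    # Minimal illustration (hypothetical annotation) of the substrate field
+    # parsing above: "galactomannan, xylan and pectin" is first turned into
+    # "galactomannan, xylan , pectin" by replace("and",","), then split on
+    # commas and stripped, yielding {"galactomannan", "xylan", "pectin"}.
+    # Note that replace() is substring-based, so a substrate name containing
+    # "and" inside a word would also be split.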
+    def Cal_PUL_Abundance(self):
+        cgcid2seqid = {}
+        for seqid in self.seqid2dbcan_annotation:
+            annotation = self.seqid2dbcan_annotation[seqid]
+            cgcid = annotation.cgcid
+            cgcid2seqid.setdefault(cgcid,[]).append(seqid)
+        cgcid2abund = {cgcid:0.0 for cgcid in cgcid2seqid}
+        cgcid2seqabund = {}
+        for cgcid in cgcid2seqid:
+            for seqid in cgcid2seqid[cgcid]:
+                #cgcid2abund[cgcid] += self.seqid2dbcan_annotation[seqid].abund
+                cgcid2seqabund.setdefault(cgcid,[]).append(self.seqid2dbcan_annotation[seqid].abund)
+
+        ### CGC abundance = mean abundance of its member genes
+        for cgcid in cgcid2seqid:
+            cgcid2abund[cgcid] = np.mean(cgcid2seqabund[cgcid])
+
+        self.cgcid2abund = cgcid2abund
+        self.cgcid2seqid = cgcid2seqid
+        self.cgcid2seqabund = cgcid2seqabund
+
+    ### self.seqid2dbcan_annotation,self.cgcid2cgc_standard
+    ### self.cgcid2seqid, cgcid2seqabund
+    ### self.cgcid2cgc_substrate
+
+    def Cal_PUL_Substrate_Abundance(self):
+        ''' two substrate predictions: homologous search and majority voting '''
+        cgcsubstrate2cgcid_homo = {}; cgcsubstrate2cgcid_major_voting = {}
+        for cgcid in self.cgcid2cgc_substrate:
+            cgc_substrate_line = self.cgcid2cgc_substrate[cgcid]
+            if cgc_substrate_line.homo_sub and cgc_substrate_line.homo_sub != "X": ### homologous search has a prediction
+                cgcsubstrate2cgcid_homo.setdefault(cgc_substrate_line.homo_sub,[]).append(cgcid)
+
+            if cgc_substrate_line.major_voting_sub: ### majority voting has a prediction
+                substrates = cgc_substrate_line.major_voting_sub.split(",")
+                for tmp_sub in substrates:
+                    cgcsubstrate2cgcid_major_voting.setdefault(tmp_sub,[]).append(cgcid)
+
+        ### for homologous search substrates
+        cgcsubstrate2abunds_homo = {}; self.cgcsubstrate2cgcid_homo = cgcsubstrate2cgcid_homo
+        for substrate in cgcsubstrate2cgcid_homo:
+            cgcids = cgcsubstrate2cgcid_homo[substrate]
+            for cgcid in cgcids:
+                cgc_abunds = self.cgcid2seqabund[cgcid] ### list of per-gene abundances in this CGC
+                #cgcsubstrate2abunds_homo.setdefault(substrate,[]).extend(cgc_abunds)
+                cgcsubstrate2abunds_homo.setdefault(substrate,[]).append(np.mean(cgc_abunds))
+        self.cgcsubstrate2abunds_homo = cgcsubstrate2abunds_homo
+        ### for majority voting substrates
+        cgcsubstrate2abunds_major_voting = {}
+        self.cgcsubstrate2cgcid_major_voting = cgcsubstrate2cgcid_major_voting
+        for substrate in cgcsubstrate2cgcid_major_voting:
+            cgcids = cgcsubstrate2cgcid_major_voting[substrate]
+            for cgcid in cgcids:
+                cgc_abunds = self.cgcid2seqabund[cgcid] ### list of per-gene abundances in this CGC
+                #cgcsubstrate2abunds_major_voting.setdefault(substrate,[]).extend(cgc_abunds)
+                cgcsubstrate2abunds_major_voting.setdefault(substrate,[]).append(np.mean(cgc_abunds))
+        self.cgcsubstrate2abunds_major_voting = cgcsubstrate2abunds_major_voting
+
+    def output_cgcsubstrate_abund(self):
+        ### for CGC substrates predicted by homologous search against dbCAN-PUL
+        cgc_substrates = []; abunds = []; cgcids = []; cgcids_abund = []
+        for cgc_substrate in self.cgcsubstrate2abunds_homo:
+            abunds.append(np.sum(self.cgcsubstrate2abunds_homo[cgc_substrate]))
+            cgc_substrates.append(cgc_substrate)
+            cgcids.append(self.cgcsubstrate2cgcid_homo[cgc_substrate])
+            cgcids_abund.append(self.cgcsubstrate2abunds_homo[cgc_substrate])
+        abund_sortidx = np.argsort(abunds)[::-1]
+        print("Writing abundance of substrates predicted by dbCAN-PUL homologous search to file CGC_substrate_PUL_homology.out!")
+        with open("CGC_substrate_PUL_homology.out",'w') as f:
+            f.write("Substrate\tAbundance(sum of CGC)\tcgcs\tcgcs_abunds\n")
+            for idx in abund_sortidx:
+                cgc = ";".join(cgcids[idx])
+                abunds_tmp = ";".join([str(round(abund,3)) for abund in cgcids_abund[idx]])
+                f.write(f"{cgc_substrates[idx]}\t{round(abunds[idx],3)}\t{cgc}\t{abunds_tmp}\n")
+
+        ### for CGC substrates predicted by majority voting
+        cgc_substrates = []; abunds = []; cgcids = []; cgcids_abund = []
+        for cgc_substrate in self.cgcsubstrate2abunds_major_voting:
+            abunds.append(np.sum(self.cgcsubstrate2abunds_major_voting[cgc_substrate]))
+            cgc_substrates.append(cgc_substrate)
+            cgcids.append(self.cgcsubstrate2cgcid_major_voting[cgc_substrate])
+            cgcids_abund.append(self.cgcsubstrate2abunds_major_voting[cgc_substrate])
+        abund_sortidx = np.argsort(abunds)[::-1]
+        print("Writing abundance of substrates predicted by majority voting to file CGC_substrate_majority_voting.out!")
+        with open("CGC_substrate_majority_voting.out",'w') as f:
+            f.write("Substrate\tAbundance(sum of CGC)\tcgcs\tcgcs_abunds\n")
+            for idx in abund_sortidx:
+                cgc = ";".join(cgcids[idx])
+                abunds_tmp = ";".join([str(round(abund,3)) for abund in cgcids_abund[idx]])
+                f.write(f"{cgc_substrates[idx]}\t{round(abunds[idx],3)}\t{cgc}\t{abunds_tmp}\n")
+    ### need to consider HMM model, subfamily and EC
+    def output_family_abund(self,method="family"):
+        fams = []; abunds = []; seqs = []
+        for familyid in self.family2abund:
+            fams.append(familyid)
+            abunds.append(self.family2abund[familyid])
+            seqs.append(self.family2seqid[familyid])
+        abund_sortidx = np.argsort(abunds)[::-1]
+        print("Writing family abundance to file fam_abund.out!")
+        print("Writing subfamily (dbCAN-sub) abundance to file subfam_abund.out!")
+        print("Writing EC abundance to file EC_abund.out!")
+        fam_file = open(self.pars.output,'w')
+        subfam_file = open("subfam_abund.out",'w')
+        EC_file = open("EC_abund.out",'w')
+
+        fam_file.write("Family\tAbundance\tSeqNum\n")
+        subfam_file.write("Subfamily\tAbundance\tSeqNum\n")
+        EC_file.write("EC\tAbundance\tSeqNum\n")
+
+        for idx in abund_sortidx:
+            famid = fams[idx]
+            if Is_EC(famid):
+                EC_file.write(f"{fams[idx]}\t{round(abunds[idx],3)}\t{len(seqs[idx])}\n")
+            elif "_e" in famid: ### dbCAN-sub subfamilies carry an _e suffix, e.g. GH25_e123
+                subfam_file.write(f"{fams[idx]}\t{round(abunds[idx],3)}\t{len(seqs[idx])}\n")
+            else:
+                fam_file.write(f"{fams[idx]}\t{round(abunds[idx],3)}\t{len(seqs[idx])}\n")
+        fam_file.close(); subfam_file.close(); EC_file.close()
+
+    def output_substrate_abund(self):
+        subs = []; abunds = []; genes = []
+        for sub in self.substrate2abund:
+            subs.append(sub)
+            abunds.append(self.substrate2abund[sub])
+            genes.append(self.substrate2seqid[sub])
+        abund_sortidx = np.argsort(abunds)[::-1]
+        print(f"Writing substrate abundance to file {self.pars.output}!")
+        with open(self.pars.output,'w') as f:
+            f.write("Substrate\tAbundance\tGeneID\n")
+            for idx in abund_sortidx:
+                if subs[idx]:
+                    f.write(f"{subs[idx]}\t{round(abunds[idx],3)}\t{';'.join(genes[idx])}\n")
+
+    def output_cgc_abund(self):
+        cgcids = []; abunds = []; seqids = []; seq_abunds = []
+        cgc_standard_records = []
+        for cgcid in self.cgcid2abund:
+            cgcids.append(cgcid)
+            abunds.append(self.cgcid2abund[cgcid])
+            seqids.append(self.cgcid2seqid[cgcid])
+            seq_abunds.append(self.cgcid2seqabund[cgcid])
+            cgc_standard_records.append(self.cgcid2cgc_standard[cgcid])
+
+        abund_sortidx = np.argsort(abunds)[::-1]
+        print(f"Writing CGC abundance to file {self.pars.output}!")
+        with open(self.pars.output,'w') as f:
+            f.write("#CGCID\tAbundance(mean)\tSeqid\tSeq_abund\tFams\n")
+            for idx in abund_sortidx:
+                seqs_tmp_abunds = ";".join([str(round(i,3)) for i in seq_abunds[idx]])
+                seqs_tmp = ";".join(seqids[idx])
+                fams = ";".join(record.protfam if record.gene_type == "CAZyme" else record.gene_type for record in cgc_standard_records[idx])
+                f.write(f"{cgcids[idx]}\t{round(abunds[idx],3)}\t{seqs_tmp}\t{seqs_tmp_abunds}\t{fams}\n")
+
+def CAZyme_abundance(args):
+    paras = abund_parameters(args)
+    CAZyme_abund = CAZyme_Abundance_estimate(paras)
+    CAZyme_abund.Cal_Seq_Abundance(args.abundance) ### per-gene abundance
+    CAZyme_abund.Cal_Family_Abundance() ### aggregate genes into families
+    CAZyme_abund.output_family_abund()
+
+def CAZymeSub_abundance(args):
+    paras = abund_parameters(args)
+    CAZymeSub_abund = CAZyme_Abundance_estimate(paras)
+    CAZymeSub_abund.Cal_Seq_Abundance(args.abundance)
+    CAZymeSub_abund.Cal_Substrate_Abundance()
+    CAZymeSub_abund.output_substrate_abund()
+
+def PUL_abundance(args):
+    paras = abund_parameters(args)
+    PUL_abund = CAZyme_Abundance_estimate(paras)
+    PUL_abund.Cal_Seq_Abundance(args.abundance)
+    PUL_abund.Cal_PUL_Abundance()
+    PUL_abund.output_cgc_abund()
+
+def PUL_Substrate_abundance(args):
+    paras = abund_parameters(args)
+    PUL_abund = CAZyme_Abundance_estimate(paras)
+    PUL_abund.Cal_Seq_Abundance(args.abundance)
+    PUL_abund.Cal_PUL_Abundance()
+    PUL_abund.Cal_PUL_Substrate_Abundance()
+    PUL_abund.output_cgcsubstrate_abund()
+
+def Get_GFF_attri(field,ID="ID"):
+    ### return the value of the `ID` attribute from a GFF attribute column
+    for attr in field.split(";"):
+        if attr.startswith(ID+"="):
+            return attr.split("=")[-1]
+    return ""
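+# Minimal illustration of Get_GFF_attri on a hypothetical GFF attribute field:
+#   Get_GFF_attri("ID=gene_1;Name=abnA;locus_tag=BSU_01;protein_id=NP_01.1", "Name")
+#   -> "abnA"
+# An empty string is returned when the requested key is absent.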
+def read_PULgff(filename):
+    geneID2feature = {}
+    for line in open(filename):
+        if not line.startswith("#"):
+            lines = line.rstrip("\n").split("\t")
+            if lines[2] == "CDS":
+                ID = Get_GFF_attri(lines[-1]).split(".")[0]
+                gene_name = Get_GFF_attri(lines[-1],"Name")
+                locus_tag = Get_GFF_attri(lines[-1],"locus_tag")
+                protein_id = Get_GFF_attri(lines[-1],"protein_id")
+                geneID2feature[ID] = (gene_name,locus_tag,protein_id)
+    return geneID2feature
+
+def read_cgcgff(filename):
+    geneID2feature = {}; line_order = 0
+    for line in open(filename):
+        if not line.startswith("#"):
+            line_order += 1
+            lines = line.rstrip("\n").split("\t")
+            ID = Get_GFF_attri(lines[-1])
+            signature = Get_GFF_attri(lines[-1],"DB")
+            family = lines[2]
+            geneID2feature[ID] = (line_order,family,signature)
+    return geneID2feature
+
+def generate_PULfaa(args):
+    ''' Read protein sequences from PUL.faa, cgc.gff and PUL.gff;
+    cgc.gff provides the coordinates and the gene, locus and protein IDs. '''
+
+    if not args.input.endswith("/"):
+        args.input = args.input + "/"
+
+    ### get all dbCAN_PUL folders
+    dbCAN_PUL_folders = os.listdir(args.input+"dbCAN-PUL")
+    seqs = []
+    for folder in dbCAN_PUL_folders:
+        if folder.endswith(".out"): ### PULID.out
+            PULID = folder.split(".")[0]
+            PULID_num = int(PULID[3:])
+            if PULID_num < 602 or PULID_num > 656: ### only PUL0602-PUL0656 are processed
+                continue
+            print(f"Processing PUL {PULID}")
+            faa_file = args.input + "dbCAN-PUL/" + folder + "/" + PULID + ".faa"
+            cgc_gff = args.input + "dbCAN-PUL/" + folder + "/" + "cgc.gff"
+            gff = args.input + "dbCAN-PUL/" + folder + "/" + PULID + ".gff"
+            if not os.path.exists(faa_file):
+                print(f"File {faa_file} does not exist!")
+                continue
+            if not os.path.exists(cgc_gff):
+                print(f"File {cgc_gff} does not exist!")
+                continue
+            if not os.path.exists(gff):
+                print(f"File {gff} does not exist!")
+                continue
+            PUL_geneid2feature = read_PULgff(gff)
+            cgcgff_geneid2feature = read_cgcgff(cgc_gff)
+            for seq in SeqIO.parse(faa_file,'fasta'):
+                oldseqid = seq.id
+                if oldseqid in cgcgff_geneid2feature and oldseqid in PUL_geneid2feature:
+                    #pulid_order:pulid:gene:locus_tag:protein_id:type:family
+                    pulid_order = PULID + "_" + str(cgcgff_geneid2feature[oldseqid][0])
+                    gene = PUL_geneid2feature[oldseqid][0]
+                    locus_tag = PUL_geneid2feature[oldseqid][1]
+                    protein_id = PUL_geneid2feature[oldseqid][2]
+                    type1 = cgcgff_geneid2feature[oldseqid][1]
+                    family = cgcgff_geneid2feature[oldseqid][2]
+                    new_seqid = f"{pulid_order}:{PULID}:{gene}:{locus_tag}:{protein_id}:{type1}:{family}"
+                    seq.id = new_seqid
+                    seq.description = ""
+                    seqs.append(seq)
+    SeqIO.write(seqs,"PUL_updated.faa",'fasta')
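+# The renamed FASTA headers written by generate_PULfaa follow the pattern
+# documented in the code above, pulid_order:pulid:gene:locus_tag:protein_id:type:family.
+# A hypothetical record for illustration:
+#   >PUL0602_3:PUL0602:abnA:BSU_01:NP_01.1:CAZyme:GH43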
+def parse_argv():
+
+    usage = '''
+    %(prog)s [positional arguments] [options]
+    -----------------------------------------
+    fam_abund [calculate the abundance of CAZymes]. Example usage: dbcan_utils fam_abund -bt samfiles/fefifo_8002_7.depth.txt -i fefifo_8002_7.dbCAN
+    -----------------------------------------
+    fam_substrate_abund [calculate the abundance of CAZyme substrates]. Example usage: dbcan_utils fam_substrate_abund -bt samfiles/fefifo_8002_7.depth.txt -i fefifo_8002_7.dbCAN
+    -----------------------------------------
+    CGC_abund [calculate the abundance of PULs]. Example usage: dbcan_utils CGC_abund -bt samfiles/fefifo_8002_7.depth.txt -i fefifo_8002_7.dbCAN
+    -----------------------------------------
+    CGC_substrate_abund [calculate the abundance of PUL substrates]. Example usage: dbcan_utils CGC_substrate_abund -bt samfiles/fefifo_8002_7.depth.txt -i fefifo_8002_7.dbCAN
+    '''
+    parser = argparse.ArgumentParser(description='Calculate the abundance of CAZymes, PULs and substrates.',prog='dbcan_utils',usage=usage)
+    parser.add_argument('function', help='Which analysis to run.',choices=["fam_abund","fam_substrate_abund","CGC_abund","CGC_substrate_abund","generate_PULfaa"])
+    parser.add_argument('-i','--input',help='dbCAN CAZyme annotation output folder.',required=True)
+    parser.add_argument('-bt','--bedtools',help='bedtools gene read count results.')
+    parser.add_argument('-1','--R1',help='R1 reads; gzip-compressed files are supported.')
+    parser.add_argument('-2','--R2',help='R2 reads; gzip-compressed files are supported. Omit for single-end sequencing.',default=None)
+    parser.add_argument('-o','--output',help='output file name',default="output")
+    parser.add_argument('-a','--abundance',default="RPM",help='normalization method',choices=["FPKM","RPM","TPM"])
+    parser.add_argument('--db_dir', default="db", help='dbCAN database directory')
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = parse_argv()
+    if args.function == "fam_abund":
+        # dbcan_utils fam_abund -bt samfiles/fefifo_8002_7.depth.txt -i fefifo_8002_7.dbCAN
+        CAZyme_abundance(args)
+    if args.function == "fam_substrate_abund":
+        # dbcan_utils fam_substrate_abund -bt samfiles/fefifo_8002_7.depth.txt -i fefifo_8002_7.dbCAN
+        CAZymeSub_abundance(args)
+    if args.function == "CGC_abund":
+        # dbcan_utils CGC_abund -bt samfiles/fefifo_8002_7.depth.txt -i fefifo_8002_7.dbCAN
+        PUL_abundance(args)
+    if args.function == "CGC_substrate_abund":
+        # dbcan_utils CGC_substrate_abund -bt samfiles/fefifo_8002_7.depth.txt -i fefifo_8002_7.dbCAN
+        PUL_Substrate_abundance(args)
+    if args.function == "generate_PULfaa": ### update the dbCAN-PUL faa proteins
+        # dbcan_utils generate_PULfaa -i db
+        generate_PULfaa(args)
+if __name__ == "__main__": ### for testing
+    pass
diff --git a/dbcan_cli/cgc_process_json.py b/dbcan_cli/cgc_process_json.py
deleted file mode 100644
index 3cfb78daf..000000000
--- a/dbcan_cli/cgc_process_json.py
+++ /dev/null
@@ -1,127 +0,0 @@
-##########################
-# to generate json file for all cgc_stardard.out file from run_dbcan
-# use: python cgc_process_json.py -i cgc_standard.out -o cgc_standard.out.json
-# written by Roland Madadjim in Cui's lab at Soc, UNL
-# last updated: 12/09/2022
-##########################
-
-#from __future__ import print_function
-import os
-import json
-import time
-import argparse
-import pandas as pd
-import numpy as np
-
-
-class PrePro:
-
-    def __init__(self, data):
-        self.df = data
-
-
-    def extract_gs(self, dataList):
-        i = 0
-        geneL = []
-        gene = list(map(lambda e : "{}".format(e['Gene_Type']),dataList))
-        pfam = list(map(lambda e : "{}".format(e['Protein_Family']),dataList))
-        while i < len(dataList):
-            if (gene[i] == 'CAZyme'):
-                s = pfam[i]
-                geneL.append(s)
-            elif (gene[i] == 'TC'):
-                s = pfam[i]
-                geneL.append(s)
-            elif (gene[i] == 'TF'):
-                s = pfam[i]
-                geneL.append(s)
-            elif (gene[i] == 'STP'):
-                s = pfam[i]
-                geneL.append(s)
-            elif (gene[i] == 'Null_Type'):
-                s = 'NA'
-                geneL.append(s)
-            i=i+1
-        gene_st = '-'.join(geneL)
-        return gene_st
-
-
-    def pul_section(self):
-        for (cgc_id), df_pul_grouped in self.df.groupby("cgc_id"):
-            datalist = list(self.cluster_section(df_pul_grouped))
-            gene_str = self.extract_gs(datalist)
-            yield {cgc_id: {
-                "changelog": [],
-                "Cluster_ID": cgc_id,
-                "Gene_String":gene_str,
-                "Contig_ID": 
self.df.loc[self.df['cgc_id'] == cgc_id, 'contig_id'].iloc[0], - "ncbi_species_tax_id": [], - "organism_name": [], - "publication": [], - "Protein": list(self.cluster_section(df_pul_grouped)) - #"dbCan_Pul_accession": ID, # as string or integer? - #"publication": df.loc[df['ID'] == ID, 'PMID'].iloc[0], - } - } - - - def cluster_section(self, df_pul_grouped): - for (contig_id,gene_type,contig_id,protein_id,gene_start,gene_stop,direction,protein_family), df_puls in df_pul_grouped.groupby( - ["contig_id","gene_type","contig_id","protein_id","gene_start","gene_stop","direction","protein_family"] - ): - yield { - "protein_id": protein_id, - "Gene_Type": gene_type, - "Gene_Start": gene_start, - "Gene_Stop": gene_stop, - "Strand": direction, - "Protein_Family": protein_family, - } - - def run_dbCan_section(self, df_puls): - for row in df_puls.itertuples(): - yield { - #"Gene_Type": row.gene_type, - #"Gene_Start": row.gene_start, - #"Gene_Stop": row.gene_stop, - #"Strand": row.direction, - # "Protein_Family": row.protein_family - } - -def file_ext(choices,fname): - ext = os.path.splitext(fname)[1][1:] - if ext not in choices: - parser.error("File needs to be a .out or .csv") - return fname - -class CustomEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, np.int64): - return str(obj) - return super().default(obj) - -def main(): - parser = argparse.ArgumentParser(description='Compiling Json from cgc_standard.out') - parser.add_argument('-i',required=True,help='path to output file (cgc_standard.out) file', type=lambda s:file_ext(("out","csv"),s)) - parser.add_argument('-o','--output') - args = parser.parse_args() - - with open(args.i) as file: ### input files - data = pd.read_csv(file, sep ='\t') - data.rename(columns = {'CGC#':'cgc_id','Gene Type':'gene_type','Contig ID':'contig_id','Protein ID':'protein_id','Gene Start':'gene_start','Gene Stop':'gene_stop','Direction':'direction','Protein Family':'protein_family'}, inplace = True) - data['gene_type'].fillna('Null_Type', inplace=True) - data['protein_family'].fillna('0', inplace=True) - p = PrePro(data) - - pul_list = list(p.pul_section()) - pul_dict = {} - for sub_dict in pul_list: - pul_dict.update(sub_dict) - jsonPuls = json.dumps(pul_dict, indent=4, cls=CustomEncoder) - - with open(args.output,"w") as outfile: - #with open("Json"+time.strftime("%Y%m%d%H%M%S")+".json","w") as outfile: - outfile.write(jsonPuls) - -if __name__ == "__main__": - main() diff --git a/dbcan_cli/hmmer_parser.py b/dbcan_cli/hmmer_parser.py deleted file mode 100644 index 3bfb94eb9..000000000 --- a/dbcan_cli/hmmer_parser.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 -########################################################## -# hmmscan parser for dbCAN meta server -# -# Based off the hmmscan parser used in the dbCAN server, -# written by Dr. Yin -# -# Written by Tanner Yohe under the supervision -# of Dr. Yin in the YinLab at NIU. -# -# Updated by Le Huang from tips the contributor WATSON Mick , -# Thank you! -# -# Modified by Alex Fraser to have a run() method that can be called and returns data for better integration with other -# scripts. This script also retains the ability to be called from shell and output to pipe redirection. -# This file had to be renamed from "hmmscan-parser.py" to "hmmscan_parser.py" because of python module import conventions. 
-# Modified on 07/06/22 -# -# INPUT -# python hmmscan-parser-dbCANmeta.py [inputFile] [eval] [coverage] -# eval and coverage are optional, inputFile is required -# -updating info: -# -adds pid for every subprocess to make codes robust. -# Last updated: 1/10/19 -########################################################### - -from subprocess import call -import sys -import os - - -def run(input_file, eval_num=1e-15, coverage=0.35, verbose=False): - - tmpfile = "temp." + str(os.getpid()) - - call("cat "+input_file+" | grep -v '^#' | awk '{print $4,$6,$1,$3,$13,$16,$17,$18,$19}' | sed 's/ /\t/g' | sort -k 3,3 -k 8n -k 9n | perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[2]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[4]<$c[4]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_.\"\n\";}}' > " + tmpfile, shell=True) - - output = "" - with open(tmpfile) as f: - for line in f: - row = line.rstrip().split('\t') - row.append(float(int(row[6])-int(row[5]))/int(row[1])) - if float(row[4]) <= float(eval_num) and float(row[-1]) >= float(coverage): - if verbose: - print('\t'.join([str(x) for x in row])) - output += '\t'.join([str(x) for x in row]) + '\n' - call(['rm', tmpfile]) - - return output - - -if __name__ == "__main__": - if len(sys.argv) > 3: - file = sys.argv[1] - eval_arg = float(sys.argv[2]) - coverage_arg = float(sys.argv[3]) - run(file, eval_arg, coverage_arg, verbose=True) - if len(sys.argv) > 1: - file = sys.argv[1] - run(file, verbose=True) - else: - print("Please give a hmmscan output file as the first command") - exit() diff --git a/dbcan_cli/hmmscan_parser.py b/dbcan_cli/hmmscan_parser.py deleted file mode 100644 index 1a8ead6fa..000000000 --- a/dbcan_cli/hmmscan_parser.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 -########################################################## -# hmmscan parser for dbCAN meta server -# -# Based off the hmmscan parser used in the dbCAN server, -# written by Dr. Yin -# -# Written by Tanner Yohe under the supervision -# of Dr. Yin in the YinLab at NIU. -# -# Updated by Le Huang from tips the contributor WATSON Mick , -# Thank you! -# -# Modified by Alex Fraser to have a run() method that can be called and returns data for better integration with other -# scripts. This script also retains the ability to be called from shell and output to pipe redirection. -# This file had to be renamed from "hmmscan-parser.py" to "hmmscan_parser.py" because of python module import conventions. -# Modified on 07/06/22 -# -# INPUT -# python hmmscan-parser-dbCANmeta.py [inputFile] [eval] [coverage] -# eval and coverage are optional, inputFile is required -# -updating info: -# -adds pid for every subprocess to make codes robust. -# Last updated: 1/10/19 -########################################################### - -from subprocess import call -import sys -import os - - -def run(input_file, eval_num=1e-15, coverage=0.35, verbose=False): - - tmpfile = "temp." 
+ str(os.getpid()) - - call("cat "+input_file+" | grep -v '^#' | awk '{print $1,$3,$4,$6,$13,$16,$17,$18,$19}' | sed 's/ /\t/g' | sort -k 3,3 -k 8n -k 9n | perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[2]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[4]<$c[4]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_.\"\n\";}}' > " + tmpfile, shell=True) - - output = "" - with open(tmpfile) as f: - for line in f: - row = line.rstrip().split('\t') - row.append(float(int(row[6])-int(row[5]))/int(row[1])) - if float(row[4]) <= float(eval_num) and float(row[-1]) >= float(coverage): - if verbose: - print('\t'.join([str(x) for x in row])) - output += '\t'.join([str(x) for x in row]) + '\n' - call(['rm', tmpfile]) - - return output - - -if __name__ == "__main__": - if len(sys.argv) > 3: - file = sys.argv[1] - eval_arg = float(sys.argv[2]) - coverage_arg = float(sys.argv[3]) - run(file, eval_arg, coverage_arg, verbose=True) - if len(sys.argv) > 1: - file = sys.argv[1] - run(file, verbose=True) - else: - print("Please give a hmmscan output file as the first command") - exit() diff --git a/dbcan_cli/run_dbcan.py b/dbcan_cli/run_dbcan.py deleted file mode 100755 index d841ee506..000000000 --- a/dbcan_cli/run_dbcan.py +++ /dev/null @@ -1,906 +0,0 @@ -#!/usr/bin/env python3 -######################################################### -# dbCAN3 (Stand Alone Version) -# -# Written by Tanner Yohe in the Yin Lab at NIU -# Revised by Qiwei Ge in Yin Lab at UNL && Le Huang at NKU -# Updated by Le Huang at UNC, Mohamad Majd Raslan in the Yin Lab at NIU, Wei Li, Qiwei Ge in Dr.Yin's Lab at UNL -# Updated by Haidong Yi for reconstructing codes, Alex Fraser for adding functions. -# Updated by Jinfang Zheng in Yinlab at UNL, new function, substrate prediciton based on dbCAN-PUL and dbCAN-sub database. - -# Recent updated information: -# Sep/07/23: Replace hmmscan with hmmsearch. Update perl code [Le Huang, Yanbin Yin] -# Dec/15/22: 1.adding function to convert cgc_standard.out to json format. 2. adding function cgc_[Jinfang Zheng] -# Dec/06/22: fix gene ID in CGCfinder output file cgc.out[Jinfang Zheng] -# Nov/06/22: Using dbCAN_sub, eCAMI has been removed [Qiwei Ge] -# Jun/13/22: Allowing direct calls to main function from other scripts [Alex Fraser] -# Sep/29/22: Hotpep has been removed, added eCAMI tool. 2. cgc out reformatting. 3. Fixed multiple GT2s [Qiwei Ge] -# -# Accepts user input -# Predicts genes if needed -# Runs input against HMMER, DIAMOND, and dbCAN_sub -# Optionally predicts CGCs with CGCFinder -# Creats an overview table using output files from core -# tools from dbsub.out,hmmer.out and diamond.out if they exist. 
-########################################################## -from subprocess import Popen, call, check_output -import os -import argparse -import dbcan -from dbcan.utils.simplify_cgc import simplify_output -from dbcan.utils.CGCFinder import cgc_finder -from dbcan_cli import hmmer_parser -import time -from dbcan.utils.cgc_substrate_prediction import cgc_substrate_prediction - - -def runHmmScan(outPath, hmm_cpu, dbDir, hmm_eval, hmm_cov, db_name): - ''' - Run Hmmer - ''' - hmmer = Popen(['hmmsearch', '--domtblout', '%sh%s.out' % (outPath, db_name), '--cpu', hmm_cpu, '-o', '/dev/null', '%s%s.hmm' % (dbDir,db_name), '%suniInput' % outPath]) - hmmer.wait() - parsed_hmm_output = hmmer_parser.run(input_file=f"{outPath}h{db_name}.out", eval_num=hmm_eval, coverage=hmm_cov) - with open(f"{outPath}{db_name}.out", 'w') as f: - f.write(parsed_hmm_output) - if os.path.exists('%sh%s.out' % (outPath, db_name)): - call(['rm', '%sh%s.out' % (outPath, db_name)]) - -def split_uniInput(uniInput,dbcan_thread,outPath,dbDir,hmm_eval,hmm_cov, dbcan_offset): - ''' - Run dbcan_sub - ''' - ticks = time.time() - file = open(uniInput, "r") - uniInput_file = file.readlines() - file.close() - signal_count = 0 - split_size = 0 - min_files = dbcan_thread - check_id = False - file_number = None - split_files = [] - fsize = int(os.path.getsize(uniInput)/float(1024*1024) * dbcan_offset) - - if fsize < 1: - fsize = 1 - - for line in uniInput_file: - if ">" in line: - signal_count+=1 - print("ID count: %s" % signal_count) - - if signal_count >= min_files: - for i in range(fsize): - f = open("%s%s.txt"%(outPath,i),"w") - f.close() - split_files.append("%s.txt"%i) - for i in range(len(uniInput_file)): - if ">" in uniInput_file[i]: - file_number = i%fsize - f = open('%s%s.txt'%(outPath,file_number), 'a') - f.write(uniInput_file[i]) - f.close() - else: - f = open('%s%s.txt'%(outPath,file_number), 'a') - f.write(uniInput_file[i]) - f.close() - - ths = [] - for j in split_files: - ths.append(Popen(['hmmsearch', '--domtblout', '%sd%s'%(outPath,j), '--cpu', '5', '-o', '/dev/null', '%sdbCAN_sub.hmm'%dbDir, "%s%s"%(outPath,j)])) - for th in ths: - th.wait() - # fn = '%sd%s'%(outPath,j) - # dd = open(fn).readlines() - # # print(dd) - print(hmm_eval, hmm_cov) - for m in split_files: - hmm_parser_output = hmmer_parser.run("%sd%s"%(outPath,m), eval_num=hmm_eval, coverage=hmm_cov) - print(hmm_parser_output) - with open("%stemp_%s"%(outPath,m), 'w') as temp_hmmer_file: - temp_hmmer_file.write(hmm_parser_output) - call(['rm', '%sd%s'%(outPath,m)]) - call(['rm', '%s%s'%(outPath,m)]) #remove temporary files - - f = open("%sdtemp.out"%outPath,"w") - f.close() - - for n in split_files: - file_read = open("%stemp_%s"%(outPath,n),"r") - files_lines = file_read.readlines() - file_read.close() - call(['rm', "%stemp_%s"%(outPath,n)]) #remove temporary files - for j in range(len(files_lines)): - f = open("%sdtemp.out"%outPath,"a") - f.write(files_lines[j]) - f.close() - else: - dbsub = Popen(['hmmsearch', '--domtblout', '%sd.txt'%outPath, '--cpu', '5', '-o', '/dev/null', '%sdbCAN_sub.hmm'%dbDir, '%suniInput'%outPath]) - dbsub.wait() - - hmm_parser_output = hmmer_parser.run("%sd.txt"%outPath, eval_num=hmm_eval, coverage=hmm_cov) - with open("%sdtemp.out"%outPath, 'w') as temp_hmmer_file: - temp_hmmer_file.write(hmm_parser_output) - - print("total time:",time.time() - ticks) - -def run(inputFile, inputType, cluster=None, dbCANFile="dbCAN.txt", dia_eval=1e-102, dia_cpu=4, hmm_eval=1e-15, - hmm_cov=0.35, hmm_cpu=4, dbcan_thread=5, dbcan_offset=2, 
tf_eval=1e-4, tf_cov=0.35, tf_cpu=1, stp_eval=1e-4, stp_cov=0.3, stp_cpu=1, prefix="", - outDir="output", dbDir="db", cgc_dis=2, cgc_sig_genes="tp", tool_arg="all", use_signalP=False, - signalP_path="signalp", gram="all"): - ''' - Run dbCAN - ''' - - # Begin Setup and Input Checks - if not dbDir.endswith("/") and len(dbDir) > 0: - dbDir += "/" - - if not outDir.endswith("/") and len(outDir) > 0: - outDir += "/" - - outPath = outDir + prefix - auxFile = "" - - find_clusters = False - if cluster != None: - find_clusters = True - if inputType == "protein": - auxFile = cluster - else: - auxFile = '%sprodigal.gff'%outPath - - if not os.path.isdir(dbDir): - print(dbDir , "ERROR: The database directory does not exist") - exit() - - if not os.path.isfile(os.path.join(dbDir,'CAZy.dmnd')): - print("ERROR: No CAZy DIAMOND database found. \ - Please make sure that your CAZy DIAMOND databased is named 'CAZy.dmnd' and is located in your database directory") - exit() - - if not os.path.isfile(os.path.join(dbDir, dbCANFile)): - print("ERROR: No dbCAN HMM database found. \ - Please make sure that your dbCAN HMM database is named 'dbCAN-HMMdb-V11.txt' or the newest one, has been through hmmpress, and is located in your database directory") - exit() - - if not os.path.isfile(os.path.join(dbDir,'dbCAN_sub.hmm')): - print("ERROR: No dbCAN_sub HMM database found. \ - Please make sure that your dbCAN_sub HMM databased is named 'dbCAN_sub.hmm' or has been through hmmpress, and is located in your database directory") - exit() - - if not os.path.isdir(outDir): - call(['mkdir', outDir]) - - if find_clusters and inputType == "protein": - if len(auxFile) > 0: - print(auxFile) - if not os.path.isfile(auxFile): - print("ERROR: It seems that the auxillary filename that you provided does not exist, or is not a file") - exit() - else: - print("ERROR: Please provide an auxillary input file with the position of each gene. This file can either be in BED or GFF format") - exit() - - tools = [True, True, True] #DIAMOND, HMMER, dbCAN_sub - if 'all' not in tool_arg: - if 'diamond' not in tool_arg: - tools[0] = False - if 'hmmer' not in tool_arg: - tools[1] = False - if 'dbcansub' not in tool_arg: - tools[2] = False - - # End Setup and Input Checks - ######################### - ######################### - # Begin Gene Prediction Tools - if inputType == 'prok': - call(['prodigal', '-i', inputFile, '-a', '%suniInput'%outPath, '-o', '%sprodigal.gff'%outPath, '-f', 'gff', '-q']) - if inputType == 'meta': - call(['prodigal', '-i', inputFile, '-a', '%suniInput'%outPath, '-o', '%sprodigal.gff'%outPath, '-f', 'gff', '-p', 'meta','-q']) - #Proteome - if inputType == 'protein': - call(['cp', inputFile, '%suniInput'%outPath]) - - # End Gene Prediction Tools - ####################### - # Begin SignalP - if use_signalP: - print("\n\n***************************0. 
SIGNALP start*************************************************\n\n") - if gram == "p" or gram=="all": - signalpos = Popen('%s -t gram+ %suniInput > %ssignalp.pos' % (signalP_path, outPath, outPath), shell=True) - if gram == "n" or gram == "all": - signalpneg = Popen('%s -t gram- %suniInput > %ssignalp.neg' % (signalP_path, outPath, outPath), shell=True) - if gram == "euk" or gram=="all": - signalpeuk = Popen('%s -t euk %suniInput > %ssignalp.euk' % (signalP_path, outPath, outPath), shell=True) - - # End SignalP - ####################### - # Begin Core Tools - - if tools[0]: ### run diamond - # diamond blastp -d db/CAZy -e 1e-102 -q output_EscheriaColiK12MG1655/uniInput -k 1 -p 2 -o output_EscheriaColiK12MG1655/diamond1.out -f 6 - print("\n\n***************************1. DIAMOND start*************************************************\n\n") - os.system('diamond blastp -d %s -e %s -q %suniInput -k 1 -p %d -o %sdiamond.out -f 6'%(os.path.join(dbDir, "CAZy"), str(dia_eval), outPath, dia_cpu, outPath)) - print("\n\n***************************1. DIAMOND end***************************************************\n\n") - - if tools[1]: ### run hmmscan (hmmer) - print("\n\n***************************2. HMMER start*************************************************\n\n") - os.system(f"hmmsearch --domtblout {outPath}h.out --cpu {hmm_cpu} -o /dev/null {os.path.join(dbDir, dbCANFile)} {outPath}uniInput ") - print("\n\n***************************2. HMMER end***************************************************\n\n") - - hmm_parser_output = hmmer_parser.run(f"{outPath}h.out", eval_num=hmm_eval, coverage=hmm_cov) - with open(f"{outPath}hmmer.out", 'w') as hmmer_file: - hmmer_file.write(hmm_parser_output) - # could clean this up and manipulate hmm_parser_output data directly instead of passing it into a temp file - with open(f"{outPath}hmmer.out", "r+") as f: - text = f.read() - f.close() - call(['rm', f"{outPath}hmmer.out"]) - text = text.split('\n') - if '' in text: - text.remove('') - for i in range(len(text)): - if 'GT2_' in text[i]: - profile = text[i].split('\t')[0].split('.')[0] - text[i] = text[i].replace(profile,'GT2') - with open(f"{outPath}hmmer.out", 'a') as f: - f.write(text[i]+'\n') - f.close() - if os.path.exists(f"{outPath}h.out"): - call(['rm', f"{outPath}h.out"]) - - if tools[2]: - print("\n\n***************************3. dbCAN_sub start***************************************************\n\n") - split_uniInput('%suniInput'%outPath,dbcan_thread,outPath,dbDir,hmm_eval,hmm_cov, dbcan_offset) - print("\n\n***************************3. 
dbCAN_sub end***************************************************\n\n") - - with open(f"{outPath}dtemp.out", 'r') as f: - with open('%sdbsub.out'%outPath, 'w') as out: - for line in f: - row = line.rstrip().split('\t') - row.append(float(int(row[6])-int(row[5]))/int(row[1])) - if float(row[4]) <= 1e-15 and float(row[-1]) >= 0.35: - out.write('\t'.join([str(x) for x in row]) + '\n') - - with open(f"{outPath}dbsub.out", 'r+') as f: #formated GT2_ in hmmer.out - text = f.read() - f.close() - call(['rm', f"{outPath}dbsub.out"]) - text = text.split('\n') - if '' in text: - text.remove('') - for i in range(len(text)): - if 'GT2_' in text[i]: - profile = text[i].split('\t')[0].split('.')[0] - text[i] = text[i].replace(profile,'GT2') - with open(f"{outPath}dbsub.out", 'a') as f: - f.write(text[i]+'\n') - f.close() - # End Core Tools - ######################## - # Begin Parse Results - - # parse dbCAN_sub result - if tools[2]: - subs_dict = {} - with open(f"{dbDir}fam-substrate-mapping-08252022.tsv", 'r') as f: - next(f) - for line in f: - r = line.split("\t") - if len(r[4]) == 1: - subs_dict[r[2],"-"] = r[0] - else: - subs_dict[r[2],r[4].strip()] = r[0] - with open(f"{outPath}dbsub.out") as f: - with open(f"{outPath}temp", 'w') as out: - out.write('dbCAN subfam\tSubfam Composition\tSubfam EC\tSubstrate\tProfile Length\tGene ID\tGene Length\tE Value\tProfile Start\tProfile End\tGene Start\tGene End\tCoverage\n') - - for line in f: - - profile = line.split("\t") - subfam = [] - sub_composition = [] - sub_ec = [] - newline = [] - substrate = [] - key1 = "-" - key2 = ["-"] - - for p in profile[0].split("|"): - if ".hmm" in p: - subfam.append(p.split(".")[0]) - key1 = p.split(".")[0].split("_")[0] - elif len(p.split(".")) == 4: - sub_ec.append(p) - key2.append(p.split(":")[0]) - else: - sub_composition.append(p) - - for i in range(len(key2)): - try: - # print(key1,key2[i]) - substrate.append(subs_dict[key1,key2[i]]) - except: - print("No substrate for it") - - subfam = "|".join(subfam) - - if sub_composition: - sub_composition = "|".join(sub_composition) - else: - sub_composition = "-" - - if sub_ec: - sub_ec = "|".join(sub_ec) - else: - sub_ec = "-" - - if substrate: - substrate = ", ".join(substrate) - else: - substrate = "-" - - rest = "\t".join(profile[1:]) - - newline = subfam + "\t" + sub_composition + "\t" + sub_ec + "\t" + substrate + "\t" + rest - out.write(newline) - call(['mv', outDir+prefix+'temp', outDir+prefix+'dbsub.out']) - - # parse hmmer result - if tools[1]: - try: - with open(outDir+prefix+'hmmer.out') as f: - with open(outDir+prefix+'temp', 'w') as out: - out.write('HMM Profile\tProfile Length\tGene ID\tGene Length\tE Value\tProfile Start\tProfile End\tGene Start\tGene End\tCoverage\n') - for line in f: - out.write(line) - call(['mv', outDir+prefix+'temp', outDir+prefix+'hmmer.out']) - except: - with open(outDir+prefix+'temp', 'w') as out: - out.write('HMM Profile\tProfile Length\tGene ID\tGene Length\tE Value\tProfile Start\tProfile End\tGene Start\tGene End\tCoverage\n') - call(['mv', outDir+prefix+'temp', outDir+prefix+'hmmer.out']) - - # parse diamond result - if tools[0]: - with open(outDir+prefix+'diamond.out') as f: - with open(outDir+prefix+'temp', 'w') as out: - out.write('Gene ID\tCAZy ID\t% Identical\tLength\tMismatches\tGap Open\tGene Start\tGene End\tCAZy Start\tCAZy End\tE Value\tBit Score\n') - for line in f: - out.write(line) - call(['mv', outDir+prefix+'temp', outDir+prefix+'diamond.out']) - - # End Parse Results - ######################## - # Begin CGCFinder - - if 
find_clusters: ### run cgc_finder or not - print("*****************************CGC-Finder start************************************") - - ######################## - # Begin TF,TP, STP prediction - ''' - tf hmmer - ''' - #call(['diamond', 'blastp', '-d', dbDir+'tf_v1/tf.dmnd', '-e', '1e-10', '-q', '%suniInput' % outPath, '-k', '1', '-p', '1', '-o', outDir+prefix+'tf.out', '-f', '6']) - runHmmScan(outPath, str(tf_cpu), dbDir, str(tf_eval), str(tf_cov), "tf-1") - runHmmScan(outPath, str(tf_cpu), dbDir, str(tf_eval), str(tf_cov), "tf-2") - ''' - stp hmmer - ''' - runHmmScan(outPath, str(stp_cpu), dbDir, str(stp_eval), str(stp_cov), "stp") - - ''' - tp diamond - ''' - call(['diamond', 'blastp', '-d', dbDir+'tcdb.dmnd', '-e', '1e-10', '-q', '%suniInput' % outPath, '-k', '1', '-p', '1', '-o', outPath+'tp.out', '-f', '6']) - - - tp = set() - tf = set() - stp = set() - - tp_genes = {} - tf_genes = {} - stp_genes = {} - - with open("%stf-1.out" % outPath) as f: - for line in f: - row = line.rstrip().split('\t') - tf.add(row[2]) - row[0] = "DBD-Pfam|" + row[0] - if not row[2] in tf_genes: - tf_genes[row[2]] = row[0] - else: - tf_genes[row[2]] += ',' + row[0] - - with open("%stf-2.out" % outPath) as f: - for line in f: - row = line.rstrip().split('\t') - tf.add(row[2]) - row[0] = "DBD-SUPERFAMILY|" + row[0] - if not row[2] in tf_genes: - tf_genes[row[2]] = row[0] - else: - tf_genes[row[2]] += ',' + row[0] - - with open(outDir+prefix+'tp.out') as f: - for line in f: - row = line.rstrip().split('\t') - tp.add(row[0]) - if not row[0] in tp_genes: - tp_genes[row[0]] = row[1] - else: - tp_genes[row[0]] += ','+row[1] - - with open("%sstp.out" % outPath) as f: - for line in f: - row = line.rstrip().split('\t') - stp.add(row[2]) - row[0] = "STP|" + row[0] - if not row[2] in stp_genes: - stp_genes[row[2]] = row[0] - else: - stp_genes[row[2]] += ',' + row[0] - # End TF and TP prediction - ########################## - # Begine CAZyme Extraction - cazyme_genes = {} - - dia = set() - hmm = set() - dbs = set() - - if tools[0]: ### deal with diamond result - with open(outDir+prefix+'diamond.out') as f: - next(f) - for line in f: - row = line.rstrip().split('\t') - dia.add(row[0]) - if row[0] not in cazyme_genes: - cazyme_genes[row[0]] = set() - cazyme_genes[row[0]].update(set(row[1].strip("|").split('|')[1:])) - - if tools[1]: ### deal with hmmscan result - with open(outDir+prefix+'hmmer.out') as f: - next(f) - for line in f: - row = line.rstrip().split('\t') - hmm.add(row[2]) - if row[2] not in cazyme_genes: - cazyme_genes[row[2]] = set() - if row[0].split('.hmm')[0] in cazyme_genes[row[2]]: - cazyme_genes[row[2]].add(" "+row[0].split('.hmm')[0]) - else: - cazyme_genes[row[2]].add(row[0].split('.hmm')[0]) - - if tools[2]: ### deal with dbcan_sub result - with open(outDir+prefix+'dbsub.out') as f: - next(f) - for line in f: - row = line.rstrip().split('\t') - dbs.add(row[5]) - if row[5] not in cazyme_genes: - cazyme_genes[row[5]] = set() - cazyme_genes[row[5]].add(row[0]) - - - if tools.count(True) > 1: - temp1 = hmm.intersection(dbs) - # print(hmm, 'This intersection hmm') - temp2 = hmm.intersection(dia) - # print(dia, 'This intersection dia') - temp3 = dia.intersection(dbs) - # print(dbs, 'This intersection dbs') - cazyme = temp1.union(temp2, temp3) - else: - cazyme = hmm.union(dia, dbs) - # End CAZyme Extraction - ###################### - # Begin GFF preperation - - if inputType == "prok" or inputType == "meta": #use Prodigal GFF output - with open(outDir+prefix+'prodigal.gff') as f: - with 
open(outDir+prefix+'cgc.gff', 'w') as out: - for line in f: - if not line.startswith("#"): - row = line.rstrip().rstrip(";").split('\t') - num = row[-1].split(";")[0].split('_')[-1] - gene = row[0] + '_' + num - row[8] = "" - if gene in cazyme: - row[2] = "CAZyme" - # Uncomment this, if all CAZyme results need to be write into cgc.out - row[8] = "DB="+'|'.join(cazyme_genes[gene]) - # - # cazyme_genes_list = list(cazyme_genes[gene]) - # row[8] = "DB="+cazyme_genes_list[0] - # - elif gene in tf: - row[2] = "TF" - row[8] = "DB="+tf_genes[gene] - elif gene in tp: - row[2] = "TC" - row[8] = "DB="+tp_genes[gene] - elif gene in stp: - row[2] = "STP" - row[8] = "DB="+stp_genes[gene] - row[8] += ";ID="+gene - out.write('\t'.join(row)+'\n') - else: #user provided GFF/BED file - gff = False - with open(auxFile) as f: - for line in f: - if not line.startswith('#'): - if len(line.split('\t')) == 9: - gff = True - break - if gff: #user file was in GFF format - with open(auxFile) as f: - with open(outDir+prefix+'cgc.gff', 'w') as out: - for line in f: - if not line.startswith("#"): - row = line.rstrip().split('\t') - if row[2] == "CDS": - note = row[8].strip().rstrip(";").split(";") - gene = "" - notes = {} - for x in note: - temp = x.split('=') - notes[temp[0]] = temp[1] - # if "Name" in notes: - # gene = notes["Name"] - # elif "ID" in notes: - # gene = notes["ID"] - if "ID" in notes: - gene = notes["ID"] - else: - continue - - if gene in cazyme: - row[2] = "CAZyme" - # Uncomment this, if all CAZyme results need to be write into cgc.out - row[8] = "DB="+'|'.join(cazyme_genes[gene]) - # - # cazyme_genes_list = list(cazyme_genes[gene]) - # row[8] = "DB="+cazyme_genes_list[0] - # - elif gene in tf: - row[2] = "TF" - row[8] = "DB="+tf_genes[gene] - elif gene in tp: - row[2] = "TC" - row[8] = "DB="+tp_genes[gene] - elif gene in stp: - row[2] = "STP" - row[8] = "DB=" + stp_genes[gene] - else: - row[8] = "" - row[8] += ";ID="+gene - out.write('\t'.join(row)+'\n') - else: #user file was in BED format - with open(auxFile) as f: - with open(outDir+prefix+'cgc.gff', 'w') as out: - for line in f: - if line.startswith("track"): - continue - row = line.rstrip().rstrip(";").split('\t') - outrow = ['.'] * 8 + [''] - gene = row[1] - if gene in cazyme: - outrow[2] = 'CAZyme' - # Uncomment this, if all CAZyme results need to be write into cgc.out - outrow[8] = "DB="+'|'.join(cazyme_genes[gene]) - # - # cazyme_genes_list = list(cazyme_genes[gene]) - # outrow[8] = "DB="+cazyme_genes_list[0] - # - elif gene in tf: - outrow[2] = 'TF' - outrow[8] = "DB="+tf_genes[gene] - elif gene in tp: - outrow[2] = 'TC' - outrow[8] = "DB="+tp_genes[gene] - elif gene in stp: - outrow[2] = 'STP' - outrow[8] = "DB=" + stp_genes[gene] - else: - outrow[2] = 'CDS' - outrow[0] = row[0] - outrow[3] = row[2] - outrow[4] = row[3] - outrow[6] = row[4] - outrow[8] += ";ID="+gene - out.write('\t'.join(outrow)+'\n') - # End GFF - #################### - # Begin CGCFinder call - print("**************************************CGC-Finder start***********************************************") - # call(['CGCFinder.py', outDir+prefix+'cgc.gff', '-o', outDir+prefix+'cgc.out', '-s', args.cgc_sig_genes, '-d', str(args.cgc_dis)]) - cgc_finder(outDir+prefix+'cgc.gff', cgc_dis, cgc_sig_genes, outDir+prefix+'cgc.out') - simplify_output(outDir+prefix+'cgc.out') - print("**************************************CGC-Finder end***********************************************") - # End CGCFinder call - # End CGCFinder - #################### - # Begin SignalP combination - if 
use_signalP: ### signalP - print("Waiting on signalP") - with open(outDir+prefix+'temp', 'w') as out: - if gram == "all" or gram =="p": - signalpos.wait() - print("SignalP pos complete") - - with open(outDir+prefix+'signalp.pos') as f: - for line in f: - if not line.startswith('#'): - row = line.split(' ') - row = [x for x in row if x != ''] - if row[9] == 'Y': - out.write(line) - call(['rm', outDir+prefix+'signalp.pos']) - if gram == "all" or gram == "n": - signalpneg.wait() - print("SignalP neg complete") - with open(outDir+prefix+'signalp.neg') as f: - for line in f: - if not line.startswith('#'): - row = line.split(' ') - row = [x for x in row if x != ''] - if row[9] == 'Y': - out.write(line) - call(['rm', outDir+prefix+'signalp.neg']) - if gram == "all" or gram == "euk": - signalpeuk.wait() - print("SignalP euk complete") - with open(outDir+prefix+'signalp.euk') as f: - for line in f: - if not line.startswith('#'): - row = line.split(' ') - row = [x for x in row if x != ''] - if row[9] == 'Y': - out.write(line) - call(['rm', outDir+prefix+'signalp.euk']) - call('sort -u '+outDir+prefix+'temp > '+outDir+prefix+'signalp.out', shell=True) - call(['rm', outDir+prefix+'temp']) - - # End SignalP combination - ####################### - ####################### - # start Overview - print("Preparing overview table from hmmer, dbCAN_sub and diamond output...") - workdir = outDir+prefix - # a function to remove duplicates from lists while keeping original order - def unique(seq): - exists = set() - return [x for x in seq if not (x in exists or exists.add(x))] - - arr_dbsub = None - arr_hmmer = None - - # check if files exist. if so, read files and get the gene numbers - if tools[0]: - arr_diamond = open(workdir+"diamond.out").readlines() - diamond_genes = [arr_diamond[i].split()[0] for i in range(1, len(arr_diamond))] # or diamond_genes = [] - - if tools[1]: - arr_hmmer = open(workdir+"hmmer.out").readlines() - hmmer_genes = [arr_hmmer[i].split()[2] for i in range(1, len(arr_hmmer))] # or hmmer_genes = [] - - if tools[2]: - arr_dbsub = open(workdir+"dbsub.out").readlines() - dbsub_genes = [arr_dbsub[i].split("\t")[5] for i in range(1, len(arr_dbsub))]# or dbsub_genes = [] - - if use_signalP and (os.path.exists(workdir + "signalp.out")): - arr_sigp = open(workdir+"signalp.out").readlines() - sigp_genes = {} - for i in range (0,len(arr_sigp)): - row = arr_sigp[i].split() - sigp_genes[row[0]] = row[4] #previous one is row[2], use Y-score instead from suggestion of Dongyao Li - - ##Catie Ausland edits BEGIN, Le add variable exists or not, remove duplicates from input lists - if not tools[0]: - diamond_genes = [] - if not tools[1]: - hmmer_genes = [] - if not tools[2]: - dbsub_genes = [] - - if len(dbsub_genes) > 0: - if (dbsub_genes[-1] == None): - #print('I am in &&&&&&&&&&&&&&&&&&&&&&') - dbsub_genes.pop() - dbsub_genes = unique(dbsub_genes) - if 'hmmer_genes' in locals(): - hmmer_genes.pop() - hmmer_genes = unique(hmmer_genes) - if 'diamond_genes' in locals(): - diamond_genes.pop() - diamond_genes = unique(diamond_genes) - ## Catie edits END, Le add variable exists or not, remove duplicates from input lists - - # parse input, stroe needed variables - if tools[0] and (len(arr_diamond) > 1): - diamond_fams = {} - for i in range (1,len(arr_diamond)): - row = arr_diamond[i].split("\t") - fam = row[1].strip("|").split("|") - diamond_fams[row[0]] = fam[1:] - - if tools[1] and (len(arr_hmmer) > 1): - hmmer_fams = {} - for i in range (1, len(arr_hmmer)): - row = arr_hmmer[i].split("\t") - fam = 
row[0].split(".") - fam = fam[0]+"("+row[7]+"-"+row[8]+")" - if(row[2] not in hmmer_fams): - hmmer_fams[row[2]] = [] - hmmer_fams[row[2]].append(fam) - - if tools[2] and (len(arr_dbsub) > 1) : - dbsub_fams = {} - for i in range (1,len(arr_dbsub)): - row_ori = arr_dbsub[i].split("\t") - fams_ID = row_ori[5] - if fams_ID not in dbsub_fams: - dbsub_fams[fams_ID] = {} - dbsub_fams[fams_ID]["fam_name"] = [] - dbsub_fams[fams_ID]["ec_num"] = [] - - dbsub_fams[fams_ID]["fam_name"].append(row_ori[0]) - dbsub_fams[fams_ID]["ec_num"].append(row_ori[2]) - - #overall table - - all_genes = unique(hmmer_genes+dbsub_genes+diamond_genes) - - with open(workdir+"overview.txt", 'w+') as fp: - if use_signalP: - fp.write("Gene ID\tEC#\tHMMER\tdbCAN_sub\tDIAMOND\tSignalp\t#ofTools\n") - else: - fp.write("Gene ID\tEC#\tHMMER\tdbCAN_sub\tDIAMOND\t#ofTools\n") - for gene in all_genes: - csv=[gene] - num_tools = 0 - - if tools[2] and arr_dbsub != None and (gene in dbsub_genes): - if dbsub_fams[gene]["ec_num"] == []: - csv.append("-") - else: - csv.append("|".join(dbsub_fams[gene]["ec_num"])) - else: - csv.append("-") - - if tools[1] and arr_hmmer != None and (gene in hmmer_genes): - num_tools += 1 - csv.append("+".join(hmmer_fams[gene])) - else: - csv.append("-") - - if tools[2] and arr_dbsub!= None and (gene in dbsub_genes): - num_tools += 1 - csv.append("+".join(dbsub_fams[gene]["fam_name"])) - else: - csv.append("-") - - if tools[0] and arr_diamond != None and (gene in diamond_genes): - num_tools += 1 - csv.append("+".join(diamond_fams[gene])) - else: - csv.append("-") - if use_signalP: - if (gene in sigp_genes): - csv.append("Y(1-"+sigp_genes[gene]+")") - else: - csv.append("N") - csv.append(str(num_tools)) - temp = "\t".join(csv) + "\n" - fp.write(temp) - print("overview table complete. Saved as "+workdir+"overview.txt") - # End overview - - -# Putting the ArgumentParser in this block allows the script to be called from command line as before, while -# allowing the main function to be called directly from other scripts without invoking a subprocess. This prevents extra -# subprocesses or extra python interpreters being spawned, as well as simplifying python scripts which call run_dbcan. -def cli_main(): - ''' - example command: - 1. CAZyme annotation with isolated genome sequence as input - run_dbcan EscheriaColiK12MG1655.fna prok - 2. CAZyme annotation with isolated protein sequence as input - run_dbcan EscheriaColiK12MG1655.faa protein - 3. CAZyme annotation with meta genome as input - run_dbcan EscheriaColiK12MG1655.fna meta - 4. CAZyme and CGC annotation with mete genome as input - run_dbcan EscheriaColiK12MG1655.fna meta -c EscheriaColiK12MG1655.gff - 5. CAZyme, CGC annotation and substrate prediction with mete genome as input - run_dbcan EscheriaColiK12MG1655.fna meta -c EscheriaColiK12MG1655.gff --cgc_substrate - ''' - parser = argparse.ArgumentParser(description='dbCAN4 Driver Script') - parser.add_argument('inputFile', help='User input file. Must be in FASTA format.') - parser.add_argument('inputType', choices=['protein', 'prok', 'meta'], #protein=proteome, prok=prokaryote nucleotide, meta=metagenome nucleotide - help='Type of sequence input. 
protein=proteome; prok=prokaryote; meta=metagenome') - parser.add_argument('--dbCANFile',default="dbCAN.txt", help='Indicate the file name of HMM database such as dbCAN.txt, please use the newest one from dbCAN2 website.') - parser.add_argument('--dia_eval', default=1e-102,type=float, help='DIAMOND E Value') - parser.add_argument('--dia_cpu', default=4, type=int, help='Number of CPU cores that DIAMOND is allowed to use') - parser.add_argument('--hmm_eval', default=1e-15, type=float, help='HMMER E Value') - parser.add_argument('--hmm_cov', default=0.35, type=float, help='HMMER Coverage val') - parser.add_argument('--hmm_cpu', default=4, type=int, help='Number of CPU cores that HMMER is allowed to use') - parser.add_argument('--out_pre', default="", help='Output files prefix') - parser.add_argument('--out_dir', default="output", help='Output directory') - parser.add_argument('--db_dir', default="db", help='Database directory') - parser.add_argument('--tools', '-t', nargs='+', choices=['hmmer', 'diamond', 'dbcansub', 'all'], default='all', help='Choose a combination of tools to run') - parser.add_argument('--use_signalP', default=False, type=bool, help='Use signalP or not, remember, you need to setup signalP tool first. Because of signalP license, Docker version does not have signalP.') - parser.add_argument('--signalP_path', '-sp',default="signalp", type=str, help='The path for signalp. Default location is signalp') - parser.add_argument('--gram', '-g', choices=["p","n","all"], default="all", help="Choose gram+(p) or gram-(n) for proteome/prokaryote nucleotide, which are params of SingalP, only if user use singalP") - parser.add_argument('-v', '--version',default="3.0.0", type=str) - # dbCAN-sub - dbCAN_sub_group = parser.add_argument_group('dbCAN-sub parameters') - dbCAN_sub_group.add_argument('--dbcan_thread', '-dt', default=5,type=int) - dbCAN_sub_group.add_argument('--tf_eval', default=1e-4, type=float, help='tf.hmm HMMER E Value') - dbCAN_sub_group.add_argument('--tf_cov', default=0.35, type=float, help='tf.hmm HMMER Coverage val') - dbCAN_sub_group.add_argument('--tf_cpu', default=1, type=int, help='tf.hmm Number of CPU cores that HMMER is allowed to use') - dbCAN_sub_group.add_argument('--stp_eval', default=1e-4, type=float, help='stp.hmm HMMER E Value') - dbCAN_sub_group.add_argument('--stp_cov', default=0.3, type=float, help='stp.hmm HMMER Coverage val') - dbCAN_sub_group.add_argument('--stp_cpu', default=1, type=int, help='stp.hmm Number of CPU cores that HMMER is allowed to use') - - ### cgc finder - cgcfinder_group = parser.add_argument_group('CGC_Finder parameters') - cgcfinder_group.add_argument('--cluster', '-c', help='Predict CGCs via CGCFinder. 
This argument requires an auxillary locations file if a protein input is being used') - cgcfinder_group.add_argument('--cgc_dis', default=2, type=int, help='CGCFinder Distance value') - cgcfinder_group.add_argument('--cgc_sig_genes', default='tp', choices=['tf', 'tp', 'stp', 'tp+tf', 'tp+stp', 'tf+stp', 'all'], help='CGCFinder Signature Genes value') - - ### cgc substrate prediction - cgcsubstrate_group = parser.add_argument_group('CGC_Substrate parameters') - cgcsubstrate_group.add_argument('--cgc_substrate',action='store_true',help="run cgc substrate prediction?") - cgcsubstrate_group.add_argument('--pul',help="dbCAN-PUL PUL.faa") - cgcsubstrate_group.add_argument('-o','--out',default="sub.prediction.out") - cgcsubstrate_group.add_argument('-w','--workdir',type=str,default=".") - cgcsubstrate_group.add_argument('-env','--env',type=str,default="local") - cgcsubstrate_group.add_argument('-odbsub','--odbsub',action='store_true',help="out dbcan_sub prediction intermediate result?") - cgcsubstrate_group.add_argument('-odbcanpul','--odbcanpul',action='store_true',help="output dbCAN-PUL prediction intermediate result?") - - ### cgc substrate prediction:dbCAN-PUL - group1 = parser.add_argument_group('dbCAN-PUL homologous searching parameters', 'how to define homologous gene hits and PUL hits') - group1.add_argument('-upghn','--uniq_pul_gene_hit_num',default = 2,type=int) - group1.add_argument('-uqcgn','--uniq_query_cgc_gene_num',default = 2,type=int) - group1.add_argument('-cpn','--CAZyme_pair_num',default = 1,type=int) - group1.add_argument('-tpn','--total_pair_num',default = 2,type=int) - group1.add_argument('-ept','--extra_pair_type',default = None,type=str,help="None[TC-TC,STP-STP]. Some like sigunature hits") - group1.add_argument('-eptn','--extra_pair_type_num',default ="0",type=str,help="specify signature pair cutoff.1,2") - group1.add_argument('-iden','--identity_cutoff',default = 0.3,type=float,help="identity to identify a homologous hit") - group1.add_argument('-cov','--coverage_cutoff',default = 0.3,type=float,help="query coverage cutoff to identify a homologous hit") - group1.add_argument('-bsc','--bitscore_cutoff',default = 50,type=float,help="bitscore cutoff to identify a homologous hit") - group1.add_argument('-evalue','--evalue_cutoff',default = 0.01,type=float,help="evalue cutoff to identify a homologous hit") - - ### cgc substrate prediction:dbCAN-sub - group2 = parser.add_argument_group('dbCAN-sub major voting parameters', 'how to define dbsub hits and dbCAN-sub subfamily substrate') - group2.add_argument('-hmmcov','--hmmcov',default = 0.3,type=float) - group2.add_argument('-hmmevalue','--hmmevalue',default = 0.01,type=float) - group2.add_argument('-ndsc','--num_of_domains_substrate_cutoff',default = 2,type=int,help="define how many domains share substrates in a CGC, one protein may include several subfamily domains.") - group2.add_argument('-npsc','--num_of_protein_substrate_cutoff',default = 2,type=int,help="define how many sequences share substrates in a CGC, one protein may include several subfamily domains.") - group2.add_argument('-subs','--substrate_scors',default = 2,type=int,help="each cgc contains with substrate must more than this value") - - args = parser.parse_args() - - ### rundbCAN3 - run(inputFile=args.inputFile, inputType=args.inputType, cluster=args.cluster, dbCANFile=args.dbCANFile, - dia_eval=args.dia_eval, dia_cpu=args.dia_cpu, hmm_eval=args.hmm_eval, hmm_cov=args.hmm_cov, - hmm_cpu=args.hmm_cpu, dbcan_thread=args.dbcan_thread, tf_eval=args.tf_eval, 
tf_cov=args.tf_cov, tf_cpu=args.tf_cpu,
-        stp_eval=args.stp_eval, stp_cov=args.stp_cov, stp_cpu=args.stp_cpu, prefix=args.out_pre, outDir=args.out_dir,
-        dbDir=args.db_dir, cgc_dis=args.cgc_dis, cgc_sig_genes=args.cgc_sig_genes, tool_arg=args.tools,
-        use_signalP=args.use_signalP, signalP_path=args.signalP_path, gram=args.gram)
-
-    ### convert cgc_standard.out to json format
-
-    if args.cluster: ### run cgc_finder
-        os.system(f"cgc_standard2json -i {args.out_dir}/cgc_standard.out -o {args.out_dir}/cgc_standard.out.json")
-    ### substarate prediction
-    if args.cgc_substrate:
-        cgc_substrate_prediction(args)
-
-if __name__ == '__main__':
-    cli_main()
\ No newline at end of file
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 000000000..d4bb2cbb9
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/_static/img/Picture1.png b/docs/_static/img/Picture1.png
new file mode 100644
index 000000000..863be29e0
Binary files /dev/null and b/docs/_static/img/Picture1.png differ
diff --git a/docs/_static/img/dbCAN3_logo.png b/docs/_static/img/dbCAN3_logo.png
new file mode 100644
index 000000000..3bdce5fe6
Binary files /dev/null and b/docs/_static/img/dbCAN3_logo.png differ
diff --git a/docs/_static/img/fig3.jpg b/docs/_static/img/fig3.jpg
new file mode 100644
index 000000000..f09f5a731
Binary files /dev/null and b/docs/_static/img/fig3.jpg differ
diff --git a/docs/api/index.rst b/docs/api/index.rst
new file mode 100644
index 000000000..081957f99
--- /dev/null
+++ b/docs/api/index.rst
@@ -0,0 +1,7 @@
+API
+===
+
+.. argparse::
+   :module: dbcan.cli.run_dbcan
+   :func: rundbCAN_parser
+   :prog: run_dbcan -h
diff --git a/docs/change-logs/2.0.11.rst b/docs/change-logs/2.0.11.rst
new file mode 100644
index 000000000..0eed5ba2f
--- /dev/null
+++ b/docs/change-logs/2.0.11.rst
@@ -0,0 +1,15 @@
+2.0.11
+~~~~~~
+
+.. rubric:: Feature
+
+#. Add EC number prediction to the ``Hotpep`` results.
+#. dbCAN2 Hotpep PPR patterns updated to the most recent release of ``CAZyDB (2019)``. Also added in missing group ``EC#`` files for families.
+#. Rename the ``Hotpep`` output files with random numbers to make the program more robust for parallel running.
+#. Prioritize the CAZyme prediction over ``TF/TC/STP`` prediction when preparing the gff input file for ``CGC-Finder``.
+#. Rewrite the installation steps. Recommend users to use a customized virtual environment (with a specific ``Python`` version).
+
+.. rubric:: BugFix
+
+#. Fix the duplicate ACC of ``tf-2.hmm`` to make it compatible with the newest ``hmmer (3.3)``. Remove the version restriction on hmmer.
+#. Fix the problem of ``prodigal``-predicted ``gff`` files ending with ``";"``.
diff --git a/docs/change-logs/2.0.6.rst b/docs/change-logs/2.0.6.rst
new file mode 100644
index 000000000..60c9ecde6
--- /dev/null
+++ b/docs/change-logs/2.0.6.rst
@@ -0,0 +1,8 @@
+2.0.6 (02/17/2020)
+~~~~~~~~~~~~~~~~~~
+
+.. rubric:: Feature
+
+#. 
Please use ``pip install run-dbcan==2.0.6`` for update. +#. We forgo ``FragGeneScan`` because this program does not work now. We use ``Prodigal`` instead. So, you don't need to use ``conda install fraggenescan`` now. +#. We add extra information ``EC number`` to the last columns of ``Hotpep`` output to make it consistent with previous ``Hotpep`` result. diff --git a/docs/change-logs/3.0.2.rst b/docs/change-logs/3.0.2.rst new file mode 100644 index 000000000..452970bb6 --- /dev/null +++ b/docs/change-logs/3.0.2.rst @@ -0,0 +1,19 @@ +3.0.2 +~~~~~ + +.. rubric:: Feature + +#. Added the `eCAMI `_ tool and removed `Hotpep`. +#. Changed the format of ``cgc.out`` to make it easy to read. +#. Re-arranged the code. The command line is ``run_dbcan`` now without the suffix ``.py``. +#. The database files under db folder is now hosted through ``Git-LFS``. To download the complete db files, please use the command ``git lfs pull`` + (Note: To use git lfs, please check the installation guide at https://git-lfs.github.com/). +#. ``Dockerfile`` is updated and we simplify the installation step. + +.. rubric:: BugFix + +- Fixed ``-t all`` which will cause the program to generate empty files. + +.. rubric:: Regular Updates + +- Updated ``CAZy db`` for ``Diamond`` and ``HMMER DB`` for ``HMMER``. diff --git a/docs/change-logs/3.0.4.rst b/docs/change-logs/3.0.4.rst new file mode 100644 index 000000000..ea58df6d0 --- /dev/null +++ b/docs/change-logs/3.0.4.rst @@ -0,0 +1,12 @@ +3.0.4 +~~~~~ + + +.. rubric:: Updates + +- Please use ``pip install dbcan==3.0.4`` for update. + +.. rubric:: BugFix + +#. Fixed the `eCAMI `_ tool problem. +#. If you download gff file from `NCBI `_, please check the last column, replace Name with ID, and ID with Name. diff --git a/docs/change-logs/3.0.5.rst b/docs/change-logs/3.0.5.rst new file mode 100644 index 000000000..95856f15c --- /dev/null +++ b/docs/change-logs/3.0.5.rst @@ -0,0 +1,13 @@ +3.0.5 +~~~~~ + + +.. rubric:: Feature + +- Please use ``pip install dbcan==3.0.5`` for updates. + + +.. rubric:: BugFix + +#. Fixed the bug in ``signalP``; +#. Fixed the ``cgc`` and ``run_dbcan`` small bugs. diff --git a/docs/change-logs/3.0.6.rst b/docs/change-logs/3.0.6.rst new file mode 100644 index 000000000..48ee87581 --- /dev/null +++ b/docs/change-logs/3.0.6.rst @@ -0,0 +1,17 @@ +3.0.6 +~~~~~ + + +.. rubric:: Feature + +- We now create the ``bioconda`` version. To old users, please run ``conda install dbcan -c bioconda`` for update from now on. + To new users, please follow the installation instruction below. + + .. code-block:: shell + + conda create -n run_dbcan python=3.8 dbcan -c conda-forge -c bioconda + conda activate run_dbcan + +.. rubric:: Updates + +- Rename the code ``hmmscan-parser.py`` to ``hmmscan_parser.py``. diff --git a/docs/change-logs/3.0.7.rst b/docs/change-logs/3.0.7.rst new file mode 100644 index 000000000..db979c99a --- /dev/null +++ b/docs/change-logs/3.0.7.rst @@ -0,0 +1,10 @@ +3.0.7 +~~~~~ + +.. rubric:: BugFix + +- Fix the bug in ``cgc_parser.py``. + +.. rubric:: Regular Updates + +- ``hmmdb``, ``cazydb``, ``tf-1``, ``tf-2``, ``stp`` and ``tcdb`` are updated. diff --git a/docs/change-logs/4.0.0.rst b/docs/change-logs/4.0.0.rst new file mode 100644 index 000000000..84258b7cd --- /dev/null +++ b/docs/change-logs/4.0.0.rst @@ -0,0 +1,16 @@ +4.0.0 +~~~~~ + +.. rubric:: Features + +#. CAZyme substrate prediction based on dbCAN-sub; +#. CGC substrate prediction based on dbCAN-PUL searching and `dbCAN-sub `_ majority voting. 
For CGC substrate prediction, please see our `dbCAN-seq update paper `_ for details. With these new functions (esp. the dbCAN-sub search), run_dbcan 4.0 is now slower to get the result back to you. Please be patient! +#. See :issue:`127` for some explanation of different output files for substrate predictions. + +.. rubric:: BugFix + +- Please split your files if your input contains > 1 million proteins. + +.. rubric:: Regular Updates + +- All the databases are updated. diff --git a/docs/change-logs/index.rst b/docs/change-logs/index.rst new file mode 100644 index 000000000..e731a40bb --- /dev/null +++ b/docs/change-logs/index.rst @@ -0,0 +1,25 @@ +.. _change-logs: + +Change Logs +=========== + +Version 4.0 +----------- + +.. include:: /change-logs/4.0.0.rst + + +Version 3.0 +----------- + +.. include:: /change-logs/3.0.7.rst +.. include:: /change-logs/3.0.6.rst +.. include:: /change-logs/3.0.5.rst +.. include:: /change-logs/3.0.4.rst +.. include:: /change-logs/3.0.2.rst + +Version 2.0 +----------- + +.. include:: /change-logs/2.0.11.rst +.. include:: /change-logs/2.0.6.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..e1566af55 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,148 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- +import sys +from datetime import datetime +from importlib.metadata import metadata +from pathlib import Path + +HERE = Path(__file__).parent +sys.path.insert(0, str(HERE / "extensions")) + + +# -- Project information ----------------------------------------------------- + +# NOTE: If you installed your project in editable mode, this might be stale. +# If this is the case, reinstall it to refresh the metadata +info = metadata("dbcan") +project_name = info["Name"] +author = info["Author"] +copyright = f"{datetime.now():%Y}, {author}." +version = info["Version"] +urls = dict(pu.split(", ") for pu in info.get_all("Project-URL")) +repository_url = urls["Source"] + +# The full version, including alpha/beta/rc tags +release = info["Version"] + +bibtex_bibfiles = ["references.bib"] +templates_path = ["_templates"] +nitpicky = True # Warn about broken links +needs_sphinx = "4.0" + +html_context = { + "display_github": True, # Integrate GitHub + "github_user": "haidyi", # Username + "github_repo": project_name, # Repo name + "github_version": "main", # Version + "conf_py_path": "/docs/", # Path in the checkout to the docs root +} + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. +# They can be extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
+extensions = [ + "myst_nb", + "sphinx_copybutton", + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinxcontrib.bibtex", + "sphinx_autodoc_typehints", + "sphinx.ext.mathjax", + "IPython.sphinxext.ipython_console_highlighting", + "sphinx_issues", + "sphinxext.opengraph", + *[p.stem for p in (HERE / "extensions").glob("*.py")], + "sphinx_design", + "sphinxarg.ext", +] + +issues_github_path = "linnabrown/run_dbcan" + +autosummary_generate = True +autodoc_member_order = "groupwise" +default_role = "literal" +napoleon_google_docstring = False +napoleon_numpy_docstring = True +napoleon_include_init_with_doc = False +napoleon_use_rtype = True # having a separate entry generally helps readability +napoleon_use_param = True +myst_heading_anchors = 6 # create anchors for h1-h6 +myst_enable_extensions = [ + "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "html_image", + "html_admonition", +] +myst_url_schemes = ("http", "https", "mailto") +nb_output_stderr = "remove" +nb_execution_mode = "off" +nb_merge_streams = True +typehints_defaults = "braces" + +source_suffix = { + ".rst": "restructuredtext", + ".ipynb": "myst-nb", + ".myst": "myst-nb", +} + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "anndata": ("https://anndata.readthedocs.io/en/stable/", None), + "numpy": ("https://numpy.org/doc/stable/", None), +} + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_book_theme" +html_static_path = ["_static"] +html_title = project_name +html_logo = "_static/img/dbCAN3_logo.png" + +html_theme_options = { + "repository_url": repository_url, + "use_repository_button": True, + "path_to_docs": "docs/", + "navigation_with_keys": False, +} + +pygments_style = "default" + +nitpick_ignore = [ + # If building the documentation fails because of a missing link that is outside your control, + # you can add an exception to this list. + # ("py:class", "igraph.Graph"), +] + + +def setup(app): + """App setup hook.""" + app.add_config_value( + "recommonmark_config", + { + "auto_toc_tree_section": "Contents", + "enable_auto_toc_tree": True, + "enable_math": True, + "enable_inline_math": False, + "enable_eval_rst": True, + }, + True, + ) diff --git a/docs/contributors.rst b/docs/contributors.rst new file mode 100644 index 000000000..9f9868d80 --- /dev/null +++ b/docs/contributors.rst @@ -0,0 +1,20 @@ +Contributors +============ + +Current Developers +------------------ + +- `Yanbin Yin `_, associated professor at `UNL `_, lead developer +- `Le Huang `_, PhD candidate at `UNC-Chapel Hill `_., `dbCAN2 `_ developer, + `run_dbcan `_ maintainer +- Jinfang Zheng, Post Doctoral Researcher at `Yin Lab `_ (now at `ZHEJIANG LAB `_), + `dbCAN3 `_ developer. +- Haidong Yi, PhD candidate at `UNC-CS `_, developer, diverse contributions. + +Former Developers +----------------- + +- Qiwei Ge, UNL CSE graduate student (now software engineer in Xi'an China). +- Catie Ausland, PhD student at `NIU `_ (now Data Scientist of NIH Genomic Data Commons at U of Chicago). 
+- Tanner Yohe, Ungraduate Student of `NIU `_ (now software engineer in Alabama). +- Han Zhang, Professor at `NKU `_. diff --git a/docs/extensions/sphinx_issues.py b/docs/extensions/sphinx_issues.py new file mode 100644 index 000000000..fa76a04b8 --- /dev/null +++ b/docs/extensions/sphinx_issues.py @@ -0,0 +1,541 @@ +"""A Sphinx extension for linking to your project's issue tracker.""" +import re +from typing import Callable, Optional, Tuple + +from docutils import nodes, utils +from sphinx.config import Config +from sphinx.util.nodes import split_explicit_title + +__version__ = "3.0.1" +__author__ = "Steven Loria" +__license__ = "MIT" + + +def cve_role(name, rawtext, text, lineno, inliner, options=None, content=None): + """Sphinx role for linking to a CVE on https://cve.mitre.org. + + Examples: :: + + :cve:`CVE-2018-17175` + + """ + options = options or {} + content = content or [] + has_explicit_title, title, target = split_explicit_title(text) + + target = utils.unescape(target).strip() + title = utils.unescape(title).strip() + ref = f"https://cve.mitre.org/cgi-bin/cvename.cgi?name={target}" + text = title if has_explicit_title else target + link = nodes.reference(text=text, refuri=ref, **options) + return [link], [] + + +GITHUB_USER_RE = re.compile("^https://github.com/([^/]+)/([^/]+)/.*") + + +def _get_default_group_and_project(config: Config, uri_config_option: str) -> Optional[Tuple[str, str]]: + """ + Retrieves the default group and project names from the configuration. + + This function extracts and returns the default group and project names based on the configuration settings. It supports both legacy and new configuration options, raising an error if both are defined. + + Parameters + ---------- + config (Config): The configuration object containing the settings. + uri_config_option (str): The URI configuration option name. + + Returns + ------- + Optional[Tuple[str, str]]: A tuple containing the group and project names, or None if not set. + + Raises + ------ + ValueError: If both old and new configuration options are set, or if the group/project format is incorrect. + """ + old_config = getattr(config, "issues_github_path", None) + new_config = getattr(config, "issues_default_group_project", None) + + if old_config and new_config: + raise ValueError( + "Both 'issues_github_path' and 'issues_default_group_project' are set, even" + " though they define the same setting. " + "Please only define one of these." + ) + group_and_project = new_config or old_config + + if group_and_project: + assert isinstance(group_and_project, str) + try: + group, project = group_and_project.split("/", maxsplit=1) + return group, project + except ValueError as e: + raise ValueError( + "`issues_github_path` or `issues_default_group_project` needs to " + "define a value in the form of `/` " + f"but `{config}` was given." + ) from e + + # If group and project was not set, we need to look for it within the github url + # for backward compatibility + if not group_and_project: + uri = getattr(config, uri_config_option) + if uri: + match = GITHUB_USER_RE.match(uri) + if match: + return match.groups()[0], match.groups()[1] + + return None + + +def _get_placeholder(uri_config_option: str) -> str: + """ + Extracts a placeholder value from a URI configuration option. + + This function processes a URI configuration option name to extract a meaningful placeholder. The placeholder is typically a part of the configuration option name. 
+ + Parameters + ---------- + uri_config_option (str): The URI configuration option name. + + Returns + ------- + str: The extracted placeholder string. + + Note: + The function handles different naming conventions for the configuration options. + """ + try: + # i.e. issues_pr_uri -> pr + return uri_config_option[:-4].split("_", maxsplit=1)[1] + except IndexError: + # issues_uri -> issue + return uri_config_option[:-5] + + +def _get_uri_template( + config: Config, + uri_config_option: str, +) -> str: + """ + Get a URL format template that can be filled with user information based on the given configuration + + The result always contains the following placeholder + - n (the issue number, user, pull request, etc...) + + The result can contain the following other placeholders + - group (same as user in github) + - project + + Examples for possible results: + + - "https://github.com/{group}/{project}/issues/{n}" + + - "https://gitlab.company.com/{group}/{project}/{n}" + + - "https://fancy.issuetrack.com?group={group}&project={project}&issue={n}" + + Raises + ------ + - ValueError if the given uri contains an invalid placeholder + """ + format_string = str(getattr(config, uri_config_option)) + placeholder = _get_placeholder(uri_config_option) + + result = format_string.replace(f"{{{placeholder}}}", "{n}") + + try: + result.format(project="", group="", n="") + except (NameError, KeyError) as e: + raise ValueError( + f"The `{uri_config_option}` option contains invalid placeholders. " + f"Only {{group}}, {{projects}} and {{{placeholder}}} are allowed." + f'Invalid format string: "{format_string}".' + ) from e + return result + + +def _get_uri( + uri_config_option: str, + config: Config, + number: str, + group_and_project: Optional[Tuple[str, str]] = None, +) -> str: + """ + Constructs a URI based on configuration options and provided parameters. + + This function generates a URI using a format string obtained from the configuration, replacing placeholders with actual values provided as parameters or from the configuration. It supports backward compatibility by allowing replacement of default group/project in the format string. + + Parameters + ---------- + uri_config_option (str): The configuration option name that specifies the URI format. + config (Config): The configuration object containing settings and format strings. + number (str): A string, typically a number, to be included in the URI. + group_and_project (Optional[Tuple[str, str]]): A tuple containing the group and project names. If not provided, the default from the configuration is used. + + Returns + ------- + str: The constructed URI based on the provided information and configuration. + + Raises + ------ + ValueError: If the format string requires a group/project to be defined, and it is not provided in the function call or configuration. 
+ """ + format_string = _get_uri_template(config, uri_config_option) + + url_vars = {"n": number} + + config_group_and_project = _get_default_group_and_project(config, uri_config_option) + if group_and_project: + # Group and Project defined by call + if config_group_and_project: + to_replace = "/".join(config_group_and_project) + if to_replace in format_string: + # Backward compatibility, replace default group/project + # with {group}/{project} + format_string = format_string.replace(to_replace, "{group}/{project}") + (url_vars["group"], url_vars["project"]) = group_and_project + elif config_group_and_project: + # If not defined by call use the default if given + (url_vars["group"], url_vars["project"]) = config_group_and_project + + try: + return format_string.format(**url_vars) + except (NameError, KeyError) as e: + # The format string was checked before, that it contains no additional not + # supported placeholders. So this occur + raise ValueError( + f"The `{uri_config_option}` format `{format_string}` requires a " + f"group/project to be defined in `issues_default_group_project`." + ) from e + + +def cwe_role(name, rawtext, text, lineno, inliner, options=None, content=None): + """Sphinx role for linking to a CWE on https://cwe.mitre.org. + + Examples: :: + + :cwe:`CWE-787` + + """ + options = options or {} + content = content or [] + has_explicit_title, title, target = split_explicit_title(text) + + target = utils.unescape(target).strip() + title = utils.unescape(title).strip() + number = target[4:] + ref = f"https://cwe.mitre.org/data/definitions/{number}.html" + text = title if has_explicit_title else target + link = nodes.reference(text=text, refuri=ref, **options) + return [link], [] + + +class IssueRole: + """ + A class for formatting and linking issues, pull requests, merge requests, and commits. + + This class handles the generation of links to issues, pull requests, merge requests, and commits based on a configuration prefix and optionally provided text formatting. It supports both internal and external repository references. + + Attributes + ---------- + ELEMENT_SEPARATORS (str): Symbols used to separate elements in references. + EXTERNAL_REPO_REGEX (re.Pattern): Regular expression for matching external repository references. + + Methods + ------- + __init__: Initializes the IssueRole instance. + default_pre_format_text: Default text formatting method. + format_text: Formats text with supported separators. + make_node: Creates a docutils node for the reference. + __call__: Processes text and returns formatted reference nodes. + """ + + # Symbols used to separate and issue/pull request/merge request etc + # i.e + # - group/project#2323 for issues + # - group/project!1234 for merge requests (in gitlab) + # - group/project@adbc1234 for commits + ELEMENT_SEPARATORS = "#@!" + + EXTERNAL_REPO_REGEX = re.compile(rf"^(\w+)/(.+)([{ELEMENT_SEPARATORS}])([\w]+)$") + + def __init__( + self, + config_prefix: str, + pre_format_text: Callable[[Config, str], str] = None, + ): + """ + Initializes the IssueRole instance. + + Parameters + ---------- + config_prefix (str): The prefix used for configuration options. + pre_format_text (Callable[[Config, str], str], optional): A function for pre-formatting text before generating the reference. 
+ """ + self.uri_config = f"{config_prefix}_uri" + self.separator_config = f"{config_prefix}_prefix" + self.pre_format_text = pre_format_text or self.default_pre_format_text + + @staticmethod + def default_pre_format_text(config: Config, text: str) -> str: + """ + Default method for pre-formatting text. + + Parameters + ---------- + config (Config): The configuration object. + text (str): The text to format. + + Returns + ------- + str: The formatted text. + """ + return text + + def format_text(self, config: Config, issue_no: str) -> str: + """ + Formats the issue number with the appropriate separator. + + Parameters + ---------- + config (Config): The configuration object. + issue_no (str): The issue number to format. + + Returns + ------- + str: The formatted issue number with the separator. + + Raises + ------ + ValueError: If an invalid separator is specified in the configuration. + """ + separator = getattr(config, self.separator_config) + if separator not in self.ELEMENT_SEPARATORS: + raise ValueError( + f"Option {self.separator_config} has to be one of " f"{', '.join(self.ELEMENT_SEPARATORS)}." + ) + text = self.pre_format_text(config, issue_no.lstrip(self.ELEMENT_SEPARATORS)) + return f"{separator}{text}" + + def make_node(self, name: str, issue_no: str, config: Config, options=None): + """ + Creates a docutils node for the given issue number. + + Parameters + ---------- + name (str): The role name. + issue_no (str): The issue number. + config (Config): The configuration object. + options (dict, optional): Additional options for the node. + + Returns + ------- + docutils.nodes.reference: A reference node pointing to the issue. + + Note: + Handles both internal and external repository references. + """ + if issue_no in ("-", "0"): + return None + + options = options or {} + + has_explicit_title, title, target = split_explicit_title(issue_no) + + if has_explicit_title: + issue_no = str(target) + + repo_match = self.EXTERNAL_REPO_REGEX.match(issue_no) + + if repo_match: + # External repo + group, project, original_separator, issue_no = repo_match.groups() + text = f"{group}/{project}{self.format_text(config, issue_no)}" + ref = _get_uri( + self.uri_config, + config, + issue_no, + (group, project), + ) + else: + text = self.format_text(config, issue_no) + ref = _get_uri(self.uri_config, config, issue_no) + if has_explicit_title: + return nodes.reference(text=title, refuri=ref, **options) + else: + return nodes.reference(text=text, refuri=ref, **options) + + def __call__(self, name, rawtext, text, lineno, inliner, options=None, content=None): + """ + Processes the raw text and returns a list of reference nodes. + + Called by docutils when the role is invoked in the documentation. + + Parameters + ---------- + name (str): The role name. + rawtext (str): The entire markup snippet, including the role. + text (str): The text marked with the role. + lineno (int): The line number where the role occurs. + inliner (docutils.parsers.rst.states.Inliner): The inliner instance. + options (dict, optional): Directive options for further customization. + content (list, optional): The directive content for nested parsing. + + Returns + ------- + tuple: A two-item tuple containing a list of nodes and a list of system messages. 
+ """ + options = options or {} + content = content or [] + issue_nos = [each.strip() for each in utils.unescape(text).split(",")] + config = inliner.document.settings.env.app.config + ret = [] + for i, issue_no in enumerate(issue_nos): + node = self.make_node(name, issue_no, config, options=options) + ret.append(node) + if i != len(issue_nos) - 1: + sep = nodes.raw(text=", ", format="html") + ret.append(sep) + return ret, [] + + +"""Sphinx role for linking to an issue. Must have +`issues_uri` or `issues_default_group_project` configured in ``conf.py``. +Examples: :: + :issue:`123` + :issue:`42,45` + :issue:`sloria/konch#123` +""" +issue_role = IssueRole( + config_prefix="issues", +) + +"""Sphinx role for linking to a pull request. Must have +`issues_pr_uri` or `issues_default_group_project` configured in ``conf.py``. +Examples: :: + :pr:`123` + :pr:`42,45` + :pr:`sloria/konch#43` +""" +pr_role = IssueRole( + config_prefix="issues_pr", +) + + +def format_commit_text(config, sha): + """ + Formats a commit SHA to a shorter version. + + This function truncates the given commit SHA to its first 7 characters, which is a common short form for representing commit hashes. + + Parameters + ---------- + config (Config): The configuration object. Currently not used in the function but included for consistency and potential future use. + sha (str): The full commit SHA string. + + Returns + ------- + str: A truncated version of the commit SHA, consisting of the first 7 characters. + """ + return sha[:7] + + +"""Sphinx role for linking to a commit. Must have +`issues_commit_uri` or `issues_default_group_project` configured in ``conf.py``. +Examples: :: + :commit:`123abc456def` + :commit:`sloria/konch@123abc456def` +""" +commit_role = IssueRole( + config_prefix="issues_commit", + pre_format_text=format_commit_text, +) + +"""Sphinx role for linking to a user profile. Defaults to linking to +GitHub profiles, but the profile URIS can be configured via the +``issues_user_uri`` config value. + +Examples: :: + + :user:`sloria` + +Anchor text also works: :: + + :user:`Steven Loria ` +""" +user_role = IssueRole(config_prefix="issues_user") + + +def setup(app): + """ + Configures the Sphinx application with custom settings for issue tracking and formatting. + + This function is used to set up various configurations for linking issues, pull requests (PRs), commits, user profiles, and more in Sphinx documentation. It defines custom URI templates and prefixes for these entities and registers several roles for inline markup in reStructuredText. + + Parameters + ---------- + app (sphinx.application.Sphinx): The Sphinx application object. + + Returns + ------- + dict: A dictionary containing the extension version and compatibility flags for parallel read and write operations. + + Note: + This function adds several configuration values to the Sphinx app, related to issue tracking and referencing in documentation. It also registers custom roles like 'issue', 'pr', 'user', 'commit', etc., for inline linking in the documentation. + """ + # Format template for issues URI + # e.g. 'https://github.com/sloria/marshmallow/issues/{issue} + app.add_config_value( + "issues_uri", + default="https://github.com/{group}/{project}/issues/{issue}", + rebuild="html", + types=[str], + ) + app.add_config_value("issues_prefix", default="#", rebuild="html", types=[str]) + # Format template for PR URI + # e.g. 
'https://github.com/sloria/marshmallow/pull/{issue} + app.add_config_value( + "issues_pr_uri", + default="https://github.com/{group}/{project}/pull/{pr}", + rebuild="html", + types=[str], + ) + app.add_config_value("issues_pr_prefix", default="#", rebuild="html", types=[str]) + # Format template for commit URI + # e.g. 'https://github.com/sloria/marshmallow/commits/{commit} + app.add_config_value( + "issues_commit_uri", + default="https://github.com/{group}/{project}/commit/{commit}", + rebuild="html", + types=[str], + ) + app.add_config_value("issues_commit_prefix", default="@", rebuild="html", types=[str]) + # There is no seperator config as a format_text function is given + + # Default User (Group)/Project eg. 'sloria/marshmallow' + # Called github as the package was working with github only before + app.add_config_value("issues_github_path", default=None, rebuild="html", types=[str]) + # Same as above but with new naming to reflect the new functionality + # Only on of both can be set + app.add_config_value("issues_default_group_project", default=None, rebuild="html", types=[str]) + # Format template for user profile URI + # e.g. 'https://github.com/{user}' + app.add_config_value( + "issues_user_uri", + default="https://github.com/{user}", + rebuild="html", + types=[str], + ) + app.add_config_value("issues_user_prefix", default="@", rebuild="html", types=[str]) + app.add_role("issue", issue_role) + app.add_role("pr", pr_role) + app.add_role("user", user_role) + app.add_role("commit", commit_role) + app.add_role("cve", cve_role) + app.add_role("cwe", cwe_role) + return { + "version": __version__, + "parallel_read_safe": True, + "parallel_write_safe": True, + } diff --git a/docs/extensions/typed_returns.py b/docs/extensions/typed_returns.py new file mode 100644 index 000000000..113520471 --- /dev/null +++ b/docs/extensions/typed_returns.py @@ -0,0 +1,32 @@ +# code from https://github.com/theislab/scanpy/blob/master/docs/extensions/typed_returns.py +# with some minor adjustment +from __future__ import annotations + +import re +from collections.abc import Generator, Iterable + +from sphinx.application import Sphinx +from sphinx.ext.napoleon import NumpyDocstring + + +def _process_return(lines: Iterable[str]) -> Generator[str, None, None]: + for line in lines: + if m := re.fullmatch(r"(?P\w+)\s+:\s+(?P[\w.]+)", line): + yield f'-{m["param"]} (:class:`~{m["type"]}`)' + else: + yield line + + +def _parse_returns_section(self: NumpyDocstring, section: str) -> list[str]: + lines_raw = self._dedent(self._consume_to_next_section()) + if lines_raw[0] == ":": + del lines_raw[0] + lines = self._format_block(":returns: ", list(_process_return(lines_raw))) + if lines and lines[-1]: + lines.append("") + return lines + + +def setup(app: Sphinx): + """Set app.""" + NumpyDocstring._parse_returns_section = _parse_returns_section diff --git a/docs/faq/index.rst b/docs/faq/index.rst new file mode 100644 index 000000000..c2a512bd4 --- /dev/null +++ b/docs/faq/index.rst @@ -0,0 +1,7 @@ +FAQ +=== + +.. toctree:: + :maxdepth: 1 + + signalp_installation diff --git a/docs/faq/signalp_installation.rst b/docs/faq/signalp_installation.rst new file mode 100644 index 000000000..33c9c9f81 --- /dev/null +++ b/docs/faq/signalp_installation.rst @@ -0,0 +1,77 @@ +SignalP Peptide Prediction Integration +====================================== + +Our program integrates peptide prediction functionality using SignalP. To enable this feature, please follow these steps: + +1. 
Activate SignalP in your program by setting the parameter ``use_signalP=True``.
+2. Acquire an academic license for SignalP and download it `from the official site `_.
+3. Extract the `signalp` Perl script from the downloaded tarball (signalp-4.1g.Linux.tar.gz) and move it to ``/usr/bin/signalp``.
+
+To run the program with SignalP, use the following command:
+
+.. code-block:: bash
+
+    run_dbcan EscheriaColiK12MG1655.fna prok --out_dir output_EscheriaColiK12MG1655 --use_signalP=TRUE
+
+.. warning::
+    If you lack permission to access `/usr/bin`, specify the path of the SignalP executable file using the `-sp` or `--signalP_path` parameter. Here's an example command:
+
+    .. code-block:: bash
+
+        run_dbcan EscheriaColiK12MG1655.fna prok --out_dir output_EscheriaColiK12MG1655 --use_signalP=TRUE -sp /home/lehuang/Downloads/signalp-4.1/signalp
+
+SignalP-4.1 Installation Instructions
+-------------------------------------
+
+Begin by decompressing the SignalP tarball and navigating to its directory:
+
+.. code-block:: bash
+
+    tar -xvf signalp-4.1g.Linux.tar.gz && cd signalp-4.1
+
+Inside the `signalp-4.1` directory, you'll find the following files and directories:
+
+.. code-block:: bash
+
+    (base) lehuang@lehuang:~/Downloads/signalp-4.1$ ls
+    bin  lib  signalp  signalp.1  signalp-4.1.readme  syn  test
+
+The `signalp` file is the Perl script that will be used by the program.
+
+Customizing the SignalP Script
+------------------------------
+
+Modify the "GENERAL SETTINGS, CUSTOMIZE ..." section at the start of the `signalp` file. Ensure these mandatory variables are correctly set:
+
+- **SIGNALP**: Specify the full path to the signalp-4.1 directory on your system.
+- **outputDir**: Choose a directory for storing temporary files (must be writable by all users).
+- **MAX_ALLOWED_ENTRIES**: Define the maximum number of input sequences allowed per run.
+
+Here's an example of how to configure these settings in the `signalp` file:
+
+.. code-block:: bash
+
+    ##############################################################################
+    #               GENERAL SETTINGS: CUSTOMIZE TO YOUR SITE
+    ##############################################################################
+
+    # Full path to the signalp-4.1 directory (mandatory)
+    BEGIN {
+        $ENV{SIGNALP} = '/home/lehuang/Downloads/signalp-4.1';
+    }
+
+    # Directory for temporary files (writable by all users)
+    my $outputDir = "/home/lehuang/Downloads/signalp-4.1/output";
+
+    # Max number of sequences per run (flexible)
+    my $MAX_ALLOWED_ENTRIES=100000;
+
+Copying the SignalP Script to /usr/bin (if accessible)
+------------------------------------------------------
+
+If you have the necessary permissions, use these commands to copy the `signalp` script:
+
+.. code-block:: bash
+
+    sudo cp signalp /usr/bin/signalp
+    sudo chmod 755 /usr/bin/signalp
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 000000000..1bf75273b
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,51 @@
+```{include} ../README.md
+
+```
+
+## Resources
+
+::::{grid} 1 2 3 3
+:gutter: 2
+
+:::{grid-item-card} Installation {octicon}`plug;1em;`
+:link: installation
+:link-type: doc
+
+New to _run_dbcan_? Check out the installation guide.
+:::
+
+:::{grid-item-card} User guide {octicon}`play;1em;`
+:link: user_guide/index
+:link-type: doc
+
+The user guide provides detailed descriptions of how to annotate carbohydrate-active enzymes using `run_dbcan`.
+:::
+
+:::{grid-item-card} API reference {octicon}`book;1em;`
+:link: api/index
+:link-type: doc
+
+The API reference contains a detailed description of
+the `run_dbcan` API.
+:::
+
+:::{grid-item-card} GitHub {octicon}`mark-github;1em;`
+:link: https://github.com/linnabrown/run_dbcan
+
+Find a bug? Interested in improving `run_dbcan`? Check out our GitHub for the latest developments.
+:::
+::::
+
+```{toctree}
+:hidden: true
+:maxdepth: 1
+
+installation
+user_guide/index
+api/index
+change-logs/index
+contributors
+references
+template_usage.md
+faq/index
+```
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 000000000..c0b0470d1
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,112 @@
+Installation
+============
+
+We support different ways to install `dbcan`_, including:
+
+- Using `Anaconda`_ or `Miniconda`_ (**Recommended**)
+- Using `PyPI`_
+- Using `Docker`_
+
+.. note::
+
+    If you prefer not to install `dbcan`_ locally, you can also use it via our online `server `_.
+
+Requirements
+------------
+
+- A POSIX-compliant operating system, e.g. ``Linux`` or ``MacOS``.
+- A ``Python`` 3.6 or later environment (you can use ``conda`` to create it).
+- When using ``Conda`` or ``PyPI`` to install `dbcan`_, you also need to prepare the ``databases`` used by `dbcan`_ separately (see :doc:`user_guide/database_preparation`).
+
+
+Installing with Conda
+---------------------
+
+If you haven't already installed ``conda``, you first need to set it up. ``Conda`` is available through `Anaconda `_
+or `Miniconda `_. Then, you can create a new ``conda`` environment (optional but recommended) using the command:
+
+.. code-block:: shell
+
+    conda create -n dbcan python=3.8
+
+If you already have a ``conda`` environment, you can skip the step above.
+
+To install the `dbcan`_ package, use the ``conda install`` command:
+
+.. code-block:: shell
+
+    conda install dbcan -c conda-forge -c bioconda
+
+Installing with PyPI
+--------------------
+
+To install the `dbcan`_ package via ``pip``, you first need to install a few executable
+dependencies:
+
+- `NCBI-BLAST+ `_;
+- `HMMER `_ (:cite:`2011:hmmer`);
+- `DIAMOND `_ (:cite:`2021:diamond`);
+- `SignalP `_ (:cite:`2017:nielsen`) (Optional).
+
+.. warning::
+
+    **SignalP Integration Notice**
+
+    Due to the specific licensing terms of `SignalP`, it is not included directly as a dependency in our package. This requires users to undertake a separate installation process.
+
+    **Installing SignalP (Optional)**:
+      - `SignalP` is optional and not essential for the core functionality of our software. Users requiring its specific features can integrate it as follows:
+          1. Visit the `SignalP website `_.
+          2. Submit a download `request `_.
+          3. Post-download, add `SignalP` to your system's environment variables to make it executable.
+      - For installation assistance, refer to :doc:`faq/signalp_installation`.
+
+    This approach ensures compliance with `SignalP`'s licensing while offering the tool's functionality to those who need it.
+
+
+
+After the dependencies are installed, `dbcan`_ can be installed via `PyPI `_:
+
+.. code-block:: shell
+
+    pip install dbcan
+
+.. note::
+
+    Since ``PyPI`` doesn't have an independent build system, the dependencies of dbcan need to be installed separately.
+    Therefore, we recommend users to install ``dbcan`` via ``Conda``, which can resolve all dependencies automatically.
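+
+If you do go the ``pip`` route, you can quickly verify that the executable dependencies listed above are discoverable on your ``PATH`` before installing. This is only a minimal sketch; the binary names (``blastp``, ``hmmscan``, ``diamond``) are the conventional ones shipped by those packages, so adjust them to your installation:
+
+.. code-block:: shell
+
+    # warn about every assumed binary that is missing from PATH
+    for tool in blastp hmmscan diamond; do
+        command -v "$tool" >/dev/null || echo "warning: $tool not found on PATH"
+    done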
+ +Installing with Docker +---------------------- + +To use `dbcan`_ via `Docker `_, please follow these +steps: + +1. Install ``Docker`` on your system (e.g. Linux, MacOS); +2. Pull the image `haidyi/run_dbcan `_ from `Docker Hub `_; +3. Run the ``run_dbcan`` tool via Docker: + + .. code-block:: shell + + docker run -it haidyi/run_dbcan:latest [args] --out_dir + + .. note:: + + To use your own local files as input when using Docker, make sure the local files are ``mounted`` and visible to your container. + +Check Installation +------------------ + +After installation, you can check if `dbcan`_ is successfully installed by running: + +.. code-block:: shell + + run_dbcan -h + +If it shows all the help information, congratulations! You are ready to annotate your own proteins right now. + +.. _dbcan: https://github.com/linnabrown/run_dbcan/ +.. _Anaconda: https://docs.anaconda.com/free/anaconda/ +.. _Miniconda: https://docs.conda.io/projects/miniconda/en/latest/ +.. _PyPI: https://pypi.org/ +.. _Docker: https://www.docker.com/ diff --git a/docs/references.bib b/docs/references.bib new file mode 100644 index 000000000..8a8936742 --- /dev/null +++ b/docs/references.bib @@ -0,0 +1,30 @@ +@article{2021:diamond, + title={Sensitive protein alignments at tree-of-life scale using DIAMOND}, + author={Buchfink, Benjamin and Reuter, Klaus and Drost, Hajk-Georg}, + journal={Nature methods}, + volume={18}, + number={4}, + pages={366--368}, + year={2021}, + publisher={Nature Publishing Group US New York} +} + +@article{2011:hmmer, + title={Accelerated profile HMM searches}, + author={Eddy, Sean R}, + journal={PLoS computational biology}, + volume={7}, + number={10}, + pages={e1002195}, + year={2011}, + publisher={Public Library of Science San Francisco, USA} +} + +@article{2017:nielsen, + title={Predicting secretory proteins with SignalP}, + author={Nielsen, Henrik}, + journal={Protein function prediction: methods and protocols}, + pages={59--73}, + year={2017}, + publisher={Springer} +} diff --git a/docs/references.rst b/docs/references.rst new file mode 100644 index 000000000..4497b53a0 --- /dev/null +++ b/docs/references.rst @@ -0,0 +1,5 @@ +References +========== + +.. bibliography:: + :cited: diff --git a/docs/user_guide/database_preparation.rst b/docs/user_guide/database_preparation.rst new file mode 100644 index 000000000..35775e4ed --- /dev/null +++ b/docs/user_guide/database_preparation.rst @@ -0,0 +1,24 @@ +Database preparation +==================== + +Install different databases and make index for them. + +.. 
code-block:: shell
+
+    test -d db || mkdir db
+    cd db \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/fam-substrate-mapping-08012023.tsv && mv fam-substrate-mapping-08012023.tsv fam-substrate-mapping.tsv \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/PUL_12112023.faa && mv PUL_12112023.faa PUL.faa && makeblastdb -in PUL.faa -dbtype prot \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL_12-12-2023.xlsx && mv dbCAN-PUL_12-12-2023.xlsx dbCAN-PUL.xlsx \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL_12-12-2023.txt && mv dbCAN-PUL_12-12-2023.txt dbCAN-PUL.txt \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL.tar.gz && tar xvf dbCAN-PUL.tar.gz && rm dbCAN-PUL.tar.gz \
+    && wget https://bcb.unl.edu/dbCAN2/download/Databases/dbCAN_sub.hmm && hmmpress dbCAN_sub.hmm \
+    && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/CAZyDB.07262023.fa && mv CAZyDB.07262023.fa CAZyDB.fa && diamond makedb --in CAZyDB.fa -d CAZy \
+    && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/dbCAN-HMMdb-V12.txt && mv dbCAN-HMMdb-V12.txt dbCAN.txt && hmmpress dbCAN.txt \
+    && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/tcdb.fa && diamond makedb --in tcdb.fa -d tcdb \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-1.hmm && hmmpress tf-1.hmm \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-2.hmm && hmmpress tf-2.hmm \
+    && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/stp.hmm && hmmpress stp.hmm \
+    && cd ../ && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.fna \
+    && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.faa \
+    && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.gff
diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst
new file mode 100644
index 000000000..d3e45c2ec
--- /dev/null
+++ b/docs/user_guide/index.rst
@@ -0,0 +1,12 @@
+User Guide
+==========
+
+.. toctree::
+   :maxdepth: 1
+
+   database_preparation
+   quick_start
+   run_from_protein_sequence
+   run_with_CGCFinder
+   run_from_raw_reads
+   run_from_DNA_sequence
diff --git a/docs/user_guide/quick_start.rst b/docs/user_guide/quick_start.rst
new file mode 100644
index 000000000..774fb1a2a
--- /dev/null
+++ b/docs/user_guide/quick_start.rst
@@ -0,0 +1,50 @@
+Quick Start
+===========
+
+This section provides a quick guide to running the run_dbcan tool suite with example data and explains the output files generated.
+
+1. Running Example Data
+-----------------------
+
+To run the dbCAN tool suite on the `Escherichia coli Strain MG1655`_ example data, use the following command. The input file `EscheriaColiK12MG1655.fna` represents the FASTA format complete genome DNA sequence, and `prok` specifies that the organism is a prokaryote.
+
+.. code-block:: shell
+
+    run_dbcan EscheriaColiK12MG1655.fna prok --out_dir output_EscheriaColiK12MG1655
+
+.. _Escherichia coli Strain MG1655: https://www.ncbi.nlm.nih.gov/nuccore/U00096.2
+
+2. Understanding the Output
+---------------------------
+
+After running the tool, several output files are generated in `output_EscheriaColiK12MG1655`, each with specific information:
+
+**uniInput**
+    The unified input file for subsequent tools, created by Prodigal if a nucleotide sequence is used.
+
+**dbsub.out**
+    Output from the dbCAN_sub run.
+
+**diamond.out**
+    Results from the Diamond BLAST.
+
+**hmmer.out**
+    Output from the HMMER run.
+
+**tf.out**
+    Diamond BLAST output predicting Transcription Factors (TFs) for CGCFinder.
+
+**tc.out**
+    Diamond BLAST output predicting Transporter Classifications (TCs) for CGCFinder.
+
+**cgc.gff**
+    GFF input file for CGCFinder.
+
+**cgc.out**
+    Output from the CGCFinder run.
+
+**cgc_standard.out**
+    Simplified version of cgc.out, containing columns like CGC_id, Type, Contig_id, Gene_id, Start, End, Strand, and Annotation.
+
+**overview.txt**
+    Summarizes CAZyme predictions across tools, including SignalP results.
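+
+As a quick sanity check, you can filter `overview.txt` by its last column (`#ofTools`), e.g. to list genes supported by at least two tools. This is a minimal sketch assuming the default column layout described above (the `Signalp` column is only present when `--use_signalP` is used, but the tool count stays in the last column):
+
+.. code-block:: shell
+
+    # print gene IDs predicted as CAZymes by >= 2 tools (last column is #ofTools)
+    awk -F'\t' 'NR > 1 && $NF >= 2 {print $1}' output_EscheriaColiK12MG1655/overview.txt | head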
diff --git a/docs/user_guide/run_from_DNA_sequence.rst b/docs/user_guide/run_from_DNA_sequence.rst
new file mode 100644
index 000000000..31438d08b
--- /dev/null
+++ b/docs/user_guide/run_from_DNA_sequence.rst
@@ -0,0 +1,13 @@
+Run from DNA Sequence
+=====================
+
+This section provides an example to run the run_dbcan tool suite with DNA sequence data.
+
+
+To run the dbCAN tool suite on the `Escherichia coli Strain MG1655`_ example data, use the following command. The input file `EscheriaColiK12MG1655.fna` represents the FASTA format complete genome DNA sequence, and `prok` specifies that the organism is a prokaryote.
+
+.. code-block:: shell
+
+    run_dbcan EscheriaColiK12MG1655.fna prok --out_dir output_EscheriaColiK12MG1655
+
+.. _Escherichia coli Strain MG1655: https://www.ncbi.nlm.nih.gov/nuccore/U00096.2
diff --git a/docs/user_guide/run_from_protein_sequence.rst b/docs/user_guide/run_from_protein_sequence.rst
new file mode 100644
index 000000000..722ea3b40
--- /dev/null
+++ b/docs/user_guide/run_from_protein_sequence.rst
@@ -0,0 +1,12 @@
+Run from Protein Sequence
+=========================
+
+This section provides an example to run the run_dbcan tool suite with protein sequence data.
+
+To run the dbCAN tool suite on the `Escherichia coli Strain MG1655`_ example data, use the following command. The input file `EscheriaColiK12MG1655.faa` contains the complete set of genome proteins in FASTA format, and `protein` specifies that the input consists of protein sequences.
+
+.. code-block:: shell
+
+    run_dbcan EscheriaColiK12MG1655.faa protein --out_dir output_EscheriaColiK12MG1655
+
+.. _Escherichia coli Strain MG1655: https://www.ncbi.nlm.nih.gov/nuccore/U00096.2
diff --git a/docs/user_guide/run_from_raw_reads.rst b/docs/user_guide/run_from_raw_reads.rst
new file mode 100644
index 000000000..95d02f8a7
--- /dev/null
+++ b/docs/user_guide/run_from_raw_reads.rst
@@ -0,0 +1,599 @@
+Run from Raw Reads
+==================
+
+Introduction
+------------
+
+dbCAN and run_dbcan require assembled contigs for CAZyme annotation.
+Typically, microbiome researchers begin with raw sequencing reads (metagenomic or metatranscriptomic) from various samples.
+These reads must be pre-processed and assembled prior to annotation.
+Additionally, there is often a need for CAZyme abundance comparison
+and visualization across multiple samples. To address these requirements,
+this protocol provides a comprehensive guide on CAZyme annotation.
+It includes steps from initial sequencing reads to the visualization of CAZyme occurrence and abundance across samples.
+Key topics covered are software setup, read pre-processing, metagenome assembly, gene prediction,
+CAZyme and CGC prediction, glycan substrate prediction, and data visualization.
+
+.. image:: ../_static/img/Picture1.png
+   :alt: workflow figure
+   :width: 800px
+   :align: center
+
+
+For this tutorial, we provide a comprehensive pipeline that teaches users how to run CAZyme annotation from raw reads and generate abundance information.
+We use Carter2023 and the individual sample assembly route of the figure above. The procedure has 4 modules and 16 steps (P1-P16).
+First, we need to create the environment.
+
+Installation and Data Preparation
+---------------------------------
+
+
+1. Downloading Carter2023 (Table 2) Raw Reads
+
+
+To download the required raw reads, use the following wget commands:
+
+.. code-block:: shell
+
+    wget https://bcb.unl.edu/dbCAN_toturial/raw_reads/Dry2014_1.fastq.gz
+    wget https://bcb.unl.edu/dbCAN_toturial/raw_reads/Dry2014_2.fastq.gz
+    wget https://bcb.unl.edu/dbCAN_toturial/raw_reads/Wet2014_1.fastq.gz
+    wget https://bcb.unl.edu/dbCAN_toturial/raw_reads/Wet2014_2.fastq.gz
+
+2. Create Anaconda Environment
+
+
+Create and activate a new Anaconda environment with the following steps:
+
+.. code-block:: shell
+
+    conda create -n CAZyme_annotation python=3.9
+    conda activate CAZyme_annotation
+
+3. Installing Bioinformatics Dependencies and dbCAN
+
+Install all necessary bioinformatics tools either with a single command or individually:
+
+.. code-block:: shell
+
+    conda install -f dbcan.configure
+
+Alternatively, install the tools one by one:
+
+.. code-block:: shell
+
+    conda install -c conda-forge -c bioconda -c defaults prokka -y
+    conda install -c bioconda megahit trim-galore -y
+    conda install -c bioconda blast bwa diamond -y
+    conda install -c bioconda hmmer -y
+    conda install -c bioconda samtools bedtools seqkit -y
+    conda install -c bioconda kraken2 -y
+    conda install -c agbiome bbtools
+    conda install -c bioconda seqtk flye minimap2
+    conda install -c conda-forge -c bioconda mmseqs2
+    conda install dbcan -c conda-forge -c bioconda
+
+
+4. Database Installation
+
+To install the databases, execute the following commands:
+
+.. code-block:: shell
+
+    test -d db || mkdir db
+    cd db \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/fam-substrate-mapping-08012023.tsv \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/PUL_12112023.faa && mv PUL_12112023.faa PUL.faa && makeblastdb -in PUL.faa -dbtype prot \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL_12-12-2023.xlsx \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL_12-12-2023.txt \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL.tar.gz && tar xvf dbCAN-PUL.tar.gz \
+    && wget https://bcb.unl.edu/dbCAN2/download/Databases/dbCAN_sub.hmm && hmmpress dbCAN_sub.hmm \
+    && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/CAZyDB.07262023.fa && diamond makedb --in CAZyDB.07262023.fa -d CAZy \
+    && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/dbCAN-HMMdb-V12.txt && mv dbCAN-HMMdb-V12.txt dbCAN.txt && hmmpress dbCAN.txt \
+    && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/tcdb.fa && diamond makedb --in tcdb.fa -d tcdb \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-1.hmm && hmmpress tf-1.hmm \
+    && wget http://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-2.hmm && hmmpress tf-2.hmm \
+    && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/stp.hmm && hmmpress stp.hmm \
+    && kraken2-build --standard --db K2
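+
+Before moving on, it is worth spot-checking that the indexes were actually built. This is a minimal sketch based on the standard outputs of the indexing tools used above (``hmmpress`` writes ``.h3f/.h3i/.h3m/.h3p`` files, ``diamond makedb`` writes ``.dmnd`` files, and ``makeblastdb -dbtype prot`` writes ``.phr/.pin/.psq`` files); run it from the directory that contains ``db``:
+
+.. code-block:: shell
+
+    # each pattern should list at least one file if the corresponding indexing step succeeded
+    ls db/dbCAN.txt.h3* db/dbCAN_sub.hmm.h3* db/CAZy.dmnd db/tcdb.dmnd db/PUL.faa.p*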
+The dbCAN-PUL_07-01-2022.txt and dbCAN-PUL_07-01-2022.xlsx files contain PUL-substrate mapping curated from literature. +Lastly, the fam-substrate-mapping-08252022.tsv file is the family-EC-substrate mapping table for the prediction of CAZyme substrates. + +.. warning:: + The conda installation and configuration step may experience prolonged time while resolving environment dependencies. Users should be patient during this process. Alternatively, users consider "mamba", + another Python package manager that offers similar functionality to Anaconda. + Information and access to mamba software can be found at https://github.com/mamba-org/mamba. + + + +Module 1: Reads processing to obtain contigs +-------------------------------------------- + + +P1. Contamination Check +^^^^^^^^^^^^^^^^^^^^^^^ + +Use `kraken2` to check for contaminated reads: + +.. code-block:: shell + + kraken2 --threads 32 --quick --paired --db K2 --report Wet2014.kreport --output Wet2014. kraken.output Wet2014_1.fastq.gz Wet2014_2.fastq.gz + kraken2 --threads 32 --quick --paired --db K2 --report Dry2014.kreport --output Dry2014. kraken.output Dry2014_1.fastq.gz Dry2014_2.fastq.gz + +Kraken2 found very little contamination in the Carter2023 data. Consequently, there was no need for the contamination removal step. + +If contamination is identified, users can align the reads to the reference genomes of potential +contamination source organisms to remove the aligned reads (Box 1). The most common source in human microbiome studies is from human hosts. + +Box 1: Removing Contamination Reads from Humans +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Kraken2 will produce the following output files. + + .. code-block:: shell + + -rw-rw-r-- 1 jinfang jinfang 2.0G Dec 12 10:24 Dry2014.kraken.output + -rw-rw-r-- 1 jinfang jinfang 1.2M Dec 12 10:25 Dry2014.kreport + -rw-rw-r-- 1 jinfang jinfang 5.1G Dec 12 09:47 Wet2014.kraken.output + -rw-rw-r-- 1 jinfang jinfang 1.1M Dec 12 09:48 Wet2014.kreport + + Suppose from these files, we have identified humans as the contamination source, we can use the following commands to remove the contamination reads by aligning reads to the human reference genome. + + .. code-block:: shell + + wget https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + bwa index -p hg38 Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + bwa mem hg38 Wet2014_1.fastq.gz Wet2014_2.fastq.gz -t 32 -o Wet2014.hg38.sam + bwa mem hg38 Dry2014_1.fastq.gz Dry2014_2.fastq.gz -t 32 -o Dry2014.hg38.sam + samtools view -f 12 Wet2014.hg38.sam > Wet2014.hg38.unmap.bam + samtools view -f 12 Dry2014.hg38.sam > Dry2014.hg38.unmap.bam + samtools fastq -1 Wet2014_1.clean.fq.gz -2 Wet2014_2.clean.fq.gz Wet2014.hg38.unmap.bam + samtools fastq -1 Dry2014_1.clean.fq.gz -2 Dry2014_2.clean.fq.gz Dry2014.hg38.unmap.bam + +P2. Trimming Adapters and Low-Quality Reads +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + trim_galore --paired Wet2014_1.fastq.gz Wet2014_2.fastq.gz --illumina -j 36 + trim_galore --paired Dry2014_1.fastq.gz Dry2014_2.fastq.gz --illumina -j 36 + + +Trim_galore is used to trim adapters and low-quality reads. +We specified `--illumina` to indicate that the reads were generated using the Illumina sequencing platform. Nonetheless, trim_galore possesses the ability to automatically detect the adapter, +providing flexibility in adapter handling for users who may know the specific sequencing platform. 
+
+P2. Trimming Adapters and Low-Quality Reads
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: shell
+
+    trim_galore --paired Wet2014_1.fastq.gz Wet2014_2.fastq.gz --illumina -j 36
+    trim_galore --paired Dry2014_1.fastq.gz Dry2014_2.fastq.gz --illumina -j 36
+
+
+Trim_galore is used to trim adapters and low-quality reads.
+We specified `--illumina` to indicate that the reads were generated using the Illumina sequencing platform. Nonetheless, trim_galore can automatically detect the adapter,
+providing flexibility in adapter handling for users who may not know the specific sequencing platform.
+Details of trimming are available in the trimming report file (Box 2).
+
+Box 2: Example output of `trim_galore`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    In addition to the trimmed read files, `trim_galore` also generates a trimming report file.
+    The trimming report contains details on read trimming, such as the number of trimmed reads.
+
+    .. code-block:: shell
+
+        -rw-rw-r-- 1 jinfang jinfang 4.2K Dec 13 01:48 Dry2014_1.fastq.gz_trimming_report.txt
+        -rw-rw-r-- 1 jinfang jinfang 2.0G Dec 13 01:55 Dry2014_1_val_1.fq.gz
+        -rw-rw-r-- 1 jinfang jinfang 4.4K Dec 13 01:55 Dry2014_2.fastq.gz_trimming_report.txt
+        -rw-rw-r-- 1 jinfang jinfang 2.4G Dec 13 01:55 Dry2014_2_val_2.fq.gz
+        -rw-rw-r-- 1 jinfang jinfang 4.4K Dec 13 01:30 Wet2014_1.fastq.gz_trimming_report.txt
+        -rw-rw-r-- 1 jinfang jinfang 3.4G Dec 13 01:46 Wet2014_1_val_1.fq.gz
+        -rw-rw-r-- 1 jinfang jinfang 4.6K Dec 13 01:46 Wet2014_2.fastq.gz_trimming_report.txt
+        -rw-rw-r-- 1 jinfang jinfang 3.7G Dec 13 01:46 Wet2014_2_val_2.fq.gz
+
+.. warning::
+
+    During the trimming process, certain reads may be removed entirely due to low quality.
+    Using the --retain_unpaired parameter in trim_galore allows for the preservation of single-end reads.
+    In this protocol, this option was not selected, so both reads of a forward-reverse pair were removed.
+
+P3. Assemble reads into contigs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Use MEGAHIT to assemble reads into contigs:
+
+
+.. code-block:: shell
+
+    megahit -m 0.5 -t 32 -o megahit_Wet2014 -1 Wet2014_1_val_1.fq.gz -2 Wet2014_2_val_2.fq.gz --out-prefix Wet2014 --min-contig-len 1000
+    megahit -m 0.5 -t 32 -o megahit_Dry2014 -1 Dry2014_1_val_1.fq.gz -2 Dry2014_2_val_2.fq.gz --out-prefix Dry2014 --min-contig-len 1000
+
+
+MEGAHIT generates two output folders. Each contains five files and one sub-folder (Box 3).
+Wet2014.contigs.fa is the final contig sequence file. We set --min-contig-len 1000,
+a common practice to retain all contigs longer than 1,000 base pairs.
+
+Box 3: Example output of `megahit`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    .. code-block:: shell
+
+        -rw-rw-r-- 1 jinfang jinfang  262 Dec 13 04:19 checkpoints.txt
+        -rw-rw-r-- 1 jinfang jinfang    0 Dec 13 04:19 done
+        drwxrwxr-x 2 jinfang jinfang 4.0K Dec 13 04:19 intermediate_contigs
+        -rw-rw-r-- 1 jinfang jinfang 1.1K Dec 13 02:22 options.json
+        -rw-rw-r-- 1 jinfang jinfang 258M Dec 13 04:19 Wet2014.contigs.fa
+
+P4. Predict Genes with Prokka
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: shell
+
+    prokka --kingdom Bacteria --cpus 36 --outdir prokka_Wet2014 --prefix Wet2014 --addgenes --addmrna --locustag Wet2014 megahit_Wet2014/Wet2014.contigs.fa
+    prokka --kingdom Bacteria --cpus 36 --outdir prokka_Dry2014 --prefix Dry2014 --addgenes --addmrna --locustag Dry2014 megahit_Dry2014/Dry2014.contigs.fa
+
+The parameter --kingdom Bacteria is required for bacterial gene prediction.
+To optimize performance, --cpus 36 instructs the utilization of 36 computer processors.
+The output files comprise both protein and CDS sequences in Fasta format (e.g., Wet2014.faa and Wet2014.ffn in Box 4).
+
+
+Box 4: Example output of `Prokka`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    .. code-block:: shell
+
+        -rw-rw-r-- 1 jinfang jinfang 8.4M Dec 14 00:51 Wet2014.err
+        -rw-rw-r-- 1 jinfang jinfang  75M Dec 13 21:38 Wet2014.faa
+        -rw-rw-r-- 1 jinfang jinfang 204M Dec 13 21:38 Wet2014.ffn
+        -rw-rw-r-- 1 jinfang jinfang 259M Dec 13 20:47 Wet2014.fna
+        -rw-rw-r-- 1 jinfang jinfang 264M Dec 13 21:38 Wet2014.fsa
+        -rw-rw-r-- 1 jinfang jinfang 599M Dec 14 00:52 Wet2014.gbk
+        -rw-rw-r-- 1 jinfang jinfang 372M Dec 13 21:38 Wet2014.gff
+        -rw-rw-r-- 1 jinfang jinfang 2.2M Dec 14 00:52 Wet2014.log
+        -rw-rw-r-- 1 jinfang jinfang 1.2G Dec 14 00:52 Wet2014.sqn
+        -rw-rw-r-- 1 jinfang jinfang  68M Dec 13 21:38 Wet2014.tbl
+        -rw-rw-r-- 1 jinfang jinfang  30M Dec 13 21:38 Wet2014.tsv
+        -rw-rw-r-- 1 jinfang jinfang  152 Dec 13 21:38 Wet2014.txt
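+Before moving to Module 2, it is worth a quick sanity check that assembly and gene prediction produced sensible numbers; for example, a simple sketch using seqkit and grep (both installed above):
+
+.. code-block:: shell
+
+    # contig counts, total length, and N50 of each assembly
+    seqkit stats -a megahit_Wet2014/Wet2014.contigs.fa megahit_Dry2014/Dry2014.contigs.fa
+    # number of predicted proteins per sample
+    grep -c ">" prokka_Wet2014/Wet2014.faa prokka_Dry2014/Dry2014.faa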
+
+Module 2. run_dbcan annotation to obtain CAZymes, CGCs, and substrates
+----------------------------------------------------------------------
+
+P5. CAZyme annotation at family level (TIMING ~10min)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: shell
+
+    run_dbcan prokka_Wet2014/Wet2014.faa protein --hmm_cpu 32 --out_dir Wet2014.CAZyme --tools hmmer --db_dir db
+    run_dbcan prokka_Dry2014/Dry2014.faa protein --hmm_cpu 32 --out_dir Dry2014.CAZyme --tools hmmer --db_dir db
+
+Two arguments are required for run_dbcan: the input sequence file (faa file) and the sequence type (protein).
+By default, run_dbcan uses three methods (HMMER vs dbCAN HMMdb, DIAMOND vs CAZy, HMMER vs dbCAN-sub HMMdb) for CAZyme annotation (Table 1, Figure 2).
+This default setting is equivalent to using the --tools all parameter (Box 5).
+Here we only invoke the HMMER vs dbCAN HMMdb search for CAZyme annotation at the family level.
+
+Box 5: CAZyme annotation with default setting
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If the --tools parameter is not set, the default setting applies, which is the same as --tools all.
+This will take a much longer time to finish (~5h) due to the large size of dbCAN-sub HMMdb (used for substrate prediction for CAZymes, see Table 1).
+
+.. code-block:: shell
+
+    run_dbcan prokka_Wet2014/Wet2014.faa protein --out_dir Wet2014.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 --tools all
+    run_dbcan prokka_Dry2014/Dry2014.faa protein --out_dir Dry2014.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 --tools all
+
+The sequence type can be `protein`, `prok`, or `meta`. If the input sequence file contains metagenomic contig sequences (`fna` file),
+the sequence type has to be `meta`, and prodigal will be called to predict genes.
+
+.. code-block:: shell
+
+    run_dbcan prokka_Wet2014/Wet2014.fna meta --out_dir Wet2014.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32
+    run_dbcan prokka_Dry2014/Dry2014.fna meta --out_dir Dry2014.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32
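+Once the run finishes, a quick look at `overview.txt` confirms that the annotation worked; a sketch assuming the default overview.txt layout (one header line followed by one row per annotated protein):
+
+.. code-block:: shell
+
+    # inspect the first few annotations
+    head -5 Wet2014.CAZyme/overview.txt
+    # count annotated CAZymes (skipping the header line)
+    tail -n +2 Wet2014.CAZyme/overview.txt | wc -l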
+5.1. Combine proteins from multiple samples
+
+.. warning::
+    As shown in Figure 3 (step 3), proteins from multiple samples can be combined to generate a non-redundant set of proteins.
+    This will reduce the runtime of the run_dbcan step (step 4), as only one faa file will be processed.
+    However, this does not work for CGC prediction, as contigs (fna files) from each sample will be needed.
+    Therefore, this step (5.1) is recommended if users only want the CAZyme annotation, and not recommended if CGCs are also to be predicted.
+
+
+This protein sequence clustering step will create a mapping table with sequence cluster IDs and protein IDs from each sample.
+
+.. code-block:: shell
+
+    mkdir mmseqs_cluster && cd mmseqs_cluster
+    ln -s ../db .
+    cat ../prokka_Wet2014/Wet2014.faa ../prokka_Dry2014/Dry2014.faa > Dry_Wet.faa
+    mmseqs easy-cluster --threads 32 -c 0.95 --min-seq-id 0.95 --cov-mode 2 Dry_Wet.faa Dry_Wet_cluster tmp
+    mv Dry_Wet_cluster_rep_seq.fasta Dry_Wet.cluster.faa
+
+This `Dry_Wet.cluster.faa` file now contains the non-redundant set of proteins from the two samples.
+
+.. code-block:: shell
+
+    grep "^>" Dry_Wet.cluster.faa | tr ">" " " | awk '{print $1}' > Dry_Wet.geneids
+    seqkit grep -f Dry_Wet.geneids ../prokka_Wet2014/Wet2014.ffn > Dry_Wet.ffn
+    seqkit grep -f Dry_Wet.geneids ../prokka_Dry2014/Dry2014.ffn >> Dry_Wet.ffn
+
+This `Dry_Wet.ffn` file now contains the CDS sequences of the non-redundant set of proteins from the two samples.
+
+.. code-block:: shell
+
+    mkdir samfiles
+    bwa index Dry_Wet.ffn
+    ln -s ../Dry2014_1_val_1.fq.gz . && ln -s ../Dry2014_2_val_2.fq.gz . && ln -s ../Wet2014_2_val_2.fq.gz . && ln -s ../Wet2014_1_val_1.fq.gz .
+    bwa mem -t 32 -o samfiles/Wet2014.CDS.sam Dry_Wet.ffn Wet2014_1_val_1.fq.gz Wet2014_2_val_2.fq.gz
+    bwa mem -t 32 -o samfiles/Dry2014.CDS.sam Dry_Wet.ffn Dry2014_1_val_1.fq.gz Dry2014_2_val_2.fq.gz
+
+The two sam files now contain the read mapping results from each sample against the `Dry_Wet.ffn` file.
+
+P6. CGC prediction.
+^^^^^^^^^^^^^^^^^^^
+
+The following commands will re-run run_dbcan to not only predict CAZymes but also CGCs with the protein `faa` and gene location `gff` files.
+
+.. code-block:: shell
+
+    run_dbcan prokka_Wet2014/Wet2014.faa protein --tools hmmer --tf_cpu 32 --stp_cpu 32 -c prokka_Wet2014/Wet2014.gff --out_dir Wet2014.PUL --dia_cpu 32 --hmm_cpu 32
+    run_dbcan prokka_Dry2014/Dry2014.faa protein --tools hmmer --tf_cpu 32 --stp_cpu 32 -c prokka_Dry2014/Dry2014.gff --out_dir Dry2014.PUL --dia_cpu 32 --hmm_cpu 32
+
+As mentioned above (Table 1, Figure 2),
+CGC prediction is a featured function added to dbCAN2 in 2018.
+To identify CGCs with the protein sequence type,
+a gene location file (gff) must also be provided.
+If the input sequence type is prok or meta, meaning users only have contig fna files, the CGC prediction can be activated by setting -c cluster.
+
+.. warning::
+
+    **Creating your own gff file**
+    If users would like to create their own gff file (instead of using Prokka or Prodigal),
+    it is important to make sure that the value of the ID attribute in the gff file matches the protein ID in the protein faa file.
+
+    **CGC not found**
+    If no result is found in the CGC output file, it is most likely because the sequence IDs in the gff file and the faa file do not match. Another less likely reason is that the contigs are too short and fragmented and thus not suitable for CGC prediction.
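+After P6 completes, the number of predicted CGCs can be estimated from `cgc_standard.out`; a sketch assuming the TSV layout described in Box 7 (column 1 = CGC id, which restarts on each contig, and column 3 = contig id):
+
+.. code-block:: shell
+
+    head -5 Wet2014.PUL/cgc_standard.out
+    # count unique (CGC id, contig id) pairs, skipping the header line
+    tail -n +2 Wet2014.PUL/cgc_standard.out | cut -f1,3 | sort -u | wc -l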
+P7. Substrate prediction for CAZymes and CGCs (TIMING ~5h)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following commands will re-run run_dbcan to predict CAZymes, CGCs, and their substrates with the `--cgc_substrate` parameter.
+
+.. code-block:: shell
+
+    run_dbcan prokka_Wet2014/Wet2014.faa protein --dbcan_thread 32 --tf_cpu 32 --stp_cpu 32 -c prokka_Wet2014/Wet2014.gff --cgc_substrate --hmm_cpu 32 --out_dir Wet2014.dbCAN --dia_cpu 32
+    run_dbcan prokka_Dry2014/Dry2014.faa protein --dbcan_thread 32 --stp_cpu 32 -c prokka_Dry2014/Dry2014.gff --cgc_substrate --out_dir Dry2014.dbCAN --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32
+
+.. warning::
+    The above commands do not set the --tools parameter, which means all three methods for CAZyme annotation will be activated (Box 5).
+    Because dbCAN-sub HMMdb (for CAZyme substrate prediction) is 200 times larger than dbCAN HMMdb, the runtime will be much longer.
+    Users can specify --tools hmmer, so that the HMMER search against dbCAN-sub will be disabled.
+    However, this will turn off the substrate prediction for CAZymes and CGCs based on CAZyme substrate majority voting.
+    Consequently, the substrate prediction will be solely based on homology search against PULs in dbCAN-PUL (Figure 1, Table 1).
+
+    .. code-block:: shell
+
+        run_dbcan prokka_Wet2014/Wet2014.faa protein --tools hmmer --stp_cpu 32 -c prokka_Wet2014/Wet2014.gff --cgc_substrate --out_dir Wet2014.PUL.Sub --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32
+        run_dbcan prokka_Dry2014/Dry2014.faa protein --tools hmmer --stp_cpu 32 -c prokka_Dry2014/Dry2014.gff --cgc_substrate --out_dir Dry2014.PUL.Sub --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32
+
+
+Box 6. Example Output Folder Content of run_dbcan Substrate Prediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    The output directory of run_dbcan substrate prediction typically contains 17 files and 1 folder:
+
+    .. code-block:: shell
+
+        -rw-rw-r-- 1 jinfang jinfang  33M Dec 17 09:36 blastp.out
+        -rw-rw-r-- 1 jinfang jinfang 3.3M Dec 17 09:35 CAZyme.pep
+        -rw-rw-r-- 1 jinfang jinfang  18M Dec 17 09:35 cgc.gff
+        -rw-rw-r-- 1 jinfang jinfang 836K Dec 17 09:35 cgc.out
+        -rw-rw-r-- 1 jinfang jinfang 374K Dec 17 09:35 cgc_standard.out
+        -rw-rw-r-- 1 jinfang jinfang 1.8M Dec 17 09:35 cgc_standard.out.json
+        -rw-rw-r-- 1 jinfang jinfang 785K Dec 17 09:31 dbsub.out
+        -rw-rw-r-- 1 jinfang jinfang 511K Dec 17 09:31 diamond.out
+        -rw-rw-r-- 1 jinfang jinfang 638K Dec 17 09:31 dtemp.out
+        -rw-rw-r-- 1 jinfang jinfang 414K Dec 17 09:31 hmmer.out
+        -rw-rw-r-- 1 jinfang jinfang 386K Dec 17 09:35 overview.txt
+        -rw-rw-r-- 1 jinfang jinfang 2.8M Dec 17 09:35 stp.out
+        -rw-rw-r-- 1 jinfang jinfang  63K Dec 17 09:36 sub.prediction.out
+        drwxrwxr-x 2 jinfang jinfang  36K Dec 17 09:39 syntenic.svg
+        -rw-rw-r-- 1 jinfang jinfang 799K Dec 17 09:32 tf-1.out
+        -rw-rw-r-- 1 jinfang jinfang 645K Dec 17 09:34 tf-2.out
+        -rw-rw-r-- 1 jinfang jinfang 2.3M Dec 17 09:35 tp.out
+        -rw-rw-r-- 1 jinfang jinfang  75M Dec 17 02:07 uniInput
+
+    Descriptions of Key Output Files:
+
+    - `blastp.out`: BLAST results between CGCs and PULs.
+    - `CAZyme.pep`: Fasta sequences of CAZymes.
+    - `cgc.gff`: Reformatted user input GFF file, marking CAZymes, TFs, TCs, and STPs.
+    - `cgc.out`: Raw output of CGC predictions.
+    - `cgc_standard.out`: Simplified version of `cgc.out` in TSV format for easy parsing (refer to Box 7 for columns).
+    - `cgc_standard.out.json`: JSON format of `cgc_standard.out`.
+    - `dbsub.out`: HMMER search result against dbCAN-sub HMMdb, with CAZyme substrates extracted from fam-substrate-mapping-08012023.tsv.
+    - `diamond.out`: DIAMOND search result against the CAZy annotated protein sequences (CAZyDB.07262023.fa).
+    - `dtemp.out`: Temporary file.
+    - `hmmer.out`: HMMER search result against dbCAN HMMdb.
+    - `overview.txt`: Summary of CAZyme annotation from three methods in TSV format (refer to Box 7 for columns).
+    - `stp.out`: HMMER search result against the MiST-compiled signal transduction protein HMMs from Pfam.
+    - `tf-1.out` and `tf-2.out`: HMMER search results against transcription factor HMMs from Pfam and Superfamily databases.
+    - `tp.out`: DIAMOND search result against the TCDB annotated protein sequences.
+    - `sub.prediction.out`: Summary of substrate prediction results for CGCs (refer to Box 7).
+    - `syntenic.svg`: Syntenic block alignment plots between all CGCs and PULs.
+    - `uniInput`: Renamed Fasta file from the input protein sequence file.
+
+
+
+Module 3. Read mapping (Figure 3) to calculate abundance for CAZyme families, subfamilies, CGCs, and substrates
+---------------------------------------------------------------------------------------------------------------
+
+P8. Read mapping to all CDS of each sample (TIMING ~20 min)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: shell
+
+    bwa index prokka_Wet2014/Wet2014.ffn
+    bwa index prokka_Dry2014/Dry2014.ffn
+    mkdir samfiles
+    bwa mem -t 32 -o samfiles/Wet2014.CDS.sam prokka_Wet2014/Wet2014.ffn Wet2014_1_val_1.fq.gz Wet2014_2_val_2.fq.gz
+    bwa mem -t 32 -o samfiles/Dry2014.CDS.sam prokka_Dry2014/Dry2014.ffn Dry2014_1_val_1.fq.gz Dry2014_2_val_2.fq.gz
+
+Reads are mapped to the ffn files from Prokka.
+
+
+P9. Read mapping to all contigs of each sample (TIMING ~20min)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: shell
+
+    bwa index megahit_Wet2014/Wet2014.contigs.fa
+    bwa index megahit_Dry2014/Dry2014.contigs.fa
+    bwa mem -t 32 -o samfiles/Wet2014.sam megahit_Wet2014/Wet2014.contigs.fa Wet2014_1_val_1.fq.gz Wet2014_2_val_2.fq.gz
+    bwa mem -t 32 -o samfiles/Dry2014.sam megahit_Dry2014/Dry2014.contigs.fa Dry2014_1_val_1.fq.gz Dry2014_2_val_2.fq.gz
+
+Reads are mapped to the contig files from MEGAHIT.
+
+
+P10. Sort SAM files by coordinates (TIMING ~8min)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: shell
+
+    cd samfiles
+    samtools sort -@ 32 -o Wet2014.CDS.bam Wet2014.CDS.sam
+    samtools sort -@ 32 -o Dry2014.CDS.bam Dry2014.CDS.sam
+    samtools sort -@ 32 -o Wet2014.bam Wet2014.sam
+    samtools sort -@ 32 -o Dry2014.bam Dry2014.sam
+    rm -rf *sam
+    cd ..
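+Optionally, overall mapping rates can be checked at this point; a quick sketch with samtools flagstat:
+
+.. code-block:: shell
+
+    samtools flagstat -@ 32 samfiles/Wet2014.CDS.bam
+    samtools flagstat -@ 32 samfiles/Dry2014.CDS.bam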
+
+P11. Read count calculation for all proteins of each sample using Bedtools (TIMING ~2min)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: shell
+
+    mkdir Wet2014_abund && cd Wet2014_abund
+    seqkit fx2tab -l -n -i ../prokka_Wet2014/Wet2014.ffn | awk '{print $1"\t"$2}' > Wet2014.length
+    seqkit fx2tab -l -n -i ../prokka_Wet2014/Wet2014.ffn | awk '{print $1"\t"0"\t"$2}' > Wet2014.bed
+    bedtools coverage -g Wet2014.length -sorted -a Wet2014.bed -counts -b ../samfiles/Wet2014.CDS.bam > Wet2014.depth.txt
+
+    cd .. && mkdir Dry2014_abund && cd Dry2014_abund
+    seqkit fx2tab -l -n -i ../prokka_Dry2014/Dry2014.ffn | awk '{print $1"\t"$2}' > Dry2014.length
+    seqkit fx2tab -l -n -i ../prokka_Dry2014/Dry2014.ffn | awk '{print $1"\t"0"\t"$2}' > Dry2014.bed
+    bedtools coverage -g Dry2014.length -sorted -a Dry2014.bed -counts -b ../samfiles/Dry2014.CDS.bam > Dry2014.depth.txt
+    cd ..
+
+Read counts are saved in the depth.txt file of each sample.
+
+P12. Read count calculation for a given region of contigs using Samtools (TIMING ~2min)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: shell
+
+    cd Wet2014_abund
+    samtools index ../samfiles/Wet2014.bam
+    samtools depth -r k141_41392:152403-165349 ../samfiles/Wet2014.bam > Wet2014.cgc.depth.txt
+    cd ..
+    cd Dry2014_abund
+    samtools index ../samfiles/Dry2014.bam
+    samtools depth -r k141_41392:152403-165349 ../samfiles/Dry2014.bam > Dry2014.cgc.depth.txt
+    cd ..
+
+The parameter -r k141_41392:152403-165349 specifies a region in a contig. For any CGC, its positional range can be found in the file cgc_standard.out produced by run_dbcan (Box 6). The cgc.depth.txt files contain the per-position read depths for the specified region.
+
+P13. dbcan_utils to calculate the abundance of CAZyme families, subfamilies, CGCs, and substrates (TIMING ~1min)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: shell
+
+    cd Wet2014_abund
+    dbcan_utils CAZyme_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM
+    dbcan_utils CAZymeSub_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM
+    dbcan_utils PUL_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM
+    dbcan_utils PULSub_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM
+
+    cd .. && cd Dry2014_abund
+    dbcan_utils CAZyme_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM
+    dbcan_utils CAZymeSub_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM
+    dbcan_utils PUL_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM
+    dbcan_utils PULSub_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM
+    cd ..
+
+We developed a set of Python scripts as dbcan_utils to take the raw read counts for all CDS as input and output the normalized abundances (Box 8) of CAZyme families, subfamilies, CGCs, and substrates (Figure 4). The parameter -a TPM can also be set to two other metrics: RPM or FPKM.
+
+Box 8. Example output of dbcan_utils
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    Executing these commands will yield five distinct files for each sample: CAZyme_abund_output,
+    PUL_abund_output, CAZymeSub_abund_output, PULSub_abund_output.major_voting, and PULSub_abund_output.homo.
+    These files contain the abundances of CAZymes, CGCs, and substrates within the respective samples,
+    and users can conveniently trace each abundance value back to the underlying read counts.
+    The abundance calculations adhere to the TPM definition.
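+For example, to glance at the most abundant CAZyme families in one sample, the abundance file can be sorted; a sketch assuming a TSV layout with the family in column 1 and its abundance in column 2 (drop the tail -n +2 step if the file has no header line):
+
+.. code-block:: shell
+
+    tail -n +2 Wet2014_abund/CAZyme_abund_output | sort -t$'\t' -k2,2nr | head -10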
+Module 4: dbcan_plot for data visualization (Figure 3) of abundances of CAZymes, CGCs, and substrates (TIMING variable)
+-----------------------------------------------------------------------------------------------------------------------
+
+To visualize the CAZyme annotation results, we provide a set of Python scripts as dbcan_plot to make publication-quality plots with the dbcan_utils results as the input. The dbcan_plot scripts can be installed as follows:
+
+.. code-block:: shell
+
+    python3 setup.py install
+
+In addition to the two abundance folders Wet2014_abund and Dry2014_abund, the two CAZyme annotation folders Wet2014.dbCAN and Dry2014.dbCAN are also needed.
+
+P14. Heatmap for CAZyme substrate abundance across samples (Figure 6A) (TIMING ~xx)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: shell
+
+    dbcan_plot heatmap_plot --samples Wet2014,Dry2014 -i Wet2014_abund/CAZymeSub_abund_output,Dry2014_abund/CAZymeSub_abund_output --show_abund --top 20
+
+Here we plot the top 20 substrates in the two samples.
+The input files are the two CAZyme substrate abundance files calculated based on the dbCAN-sub result.
+The default heatmap is ranked by substrate abundances.
+To rank the heatmap by clustering the abundance profiles instead,
+users can invoke the `--cluster_map` parameter.
+
+P15. Barplot for CAZyme substrate abundance across samples (Figure 6B) (TIMING ~xx)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: shell
+
+    dbcan_plot bar_plot --samples Wet2014,Dry2014 --vertical_bar --top 20 -i Wet2014_abund/CAZyme_abund_output,Dry2014_abund/CAZyme_abund_output
+
+Users can choose to generate a barplot instead of a heatmap using the bar_plot method.
+
+
+P16. Synteny plot between a CGC and its best PUL hit with read mapping coverage to CGC (Figure 6C) (TIMING ~xx)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: shell
+
+    dbcan_plot CGC_syntenic_with_PUL_abund -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3' --readscount Wet2014_abund/Wet2014.cgc.depth.txt
+
+The Wet2014.dbCAN folder contains the PUL.out file. Using this file, the cgc_standard.out file, and the best PUL's gff file in dbCAN-PUL.tar.gz, the CGC_syntenic_with_PUL_abund method will create the CGC-PUL synteny plot. The --cgcid parameter is required to specify which CGC is to be plotted ('k141_41392|CGC3' in this example). The Wet2014.cgc.depth.txt file is used to plot the read mapping coverage.
+
+If users only want to plot the CGC structure:
+
+.. code-block:: shell
+
+    dbcan_plot CGC -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3'
+
+If users only want to plot the CGC structure plus the read mapping coverage:
+
+.. code-block:: shell
+
+    dbcan_plot CGC_abund -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3' --readscount Wet2014_abund/Wet2014.cgc.depth.txt
+
+If users only want to plot the synteny between the CGC and PUL:
+
+.. code-block:: shell
+
+    dbcan_plot CGC_syntenic_with_PUL -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3'
+
+.. warning::
+
+    The CGC IDs in different samples do not match each other. For example, specifying -i Wet2014.dbCAN plots `k141_41392|CGC3` in the Wet2014 sample. The `k141_41392|CGC3` in the Dry2014 sample will be different.
diff --git a/docs/user_guide/run_with_CGCFinder.rst b/docs/user_guide/run_with_CGCFinder.rst
new file mode 100644
index 000000000..63e9ffe5a
--- /dev/null
+++ b/docs/user_guide/run_with_CGCFinder.rst
@@ -0,0 +1,20 @@
+Run with CGCFinder
+==================
+
+A CAZyme gene cluster (CGC) refers to a group of genes co-located on the genome that are collectively involved in the metabolism of carbohydrates. These gene clusters encode enzymes and other proteins that work together to perform specific functions related to carbohydrate processing.
The concept of a CAZyme gene cluster is particularly relevant in the context of microbial genomes, where such clusters often play crucial roles in the utilization of diverse carbohydrate sources.
+
+Here is an example of how to use run_dbcan to look for CGCs from `Escherichia coli Strain MG1655`_:
+
+Use `-c cluster` to turn on the CGCFinder function for a complete genome file:
+
+.. code-block:: shell
+
+    run_dbcan EscheriaColiK12MG1655.fna prok -c cluster --out_dir output_EscheriaColiK12MG1655
+
+Or use `-c EscheriaColiK12MG1655.gff` to turn on the CGCFinder function for protein sequence input; a GFF or BED format file with gene position information is required to run CGCFinder when the input is protein sequences.
+
+.. code-block:: shell
+
+    run_dbcan EscheriaColiK12MG1655.faa protein -c EscheriaColiK12MG1655.gff --out_dir output_EscheriaColiK12MG1655
+
+.. _Escherichia coli Strain MG1655: https://www.ncbi.nlm.nih.gov/nuccore/U00096.2
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000..65993e880
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,179 @@
+[build-system]
+build-backend = "hatchling.build"
+requires = ["hatchling"]
+
+[project]
+name = "dbcan"
+version = "4.1.0"
+description = "Standalone version of dbCAN annotation tool for automated CAZyme annotation"
+readme = "README.md"
+requires-python = ">=3.6"
+license = {file = "LICENSE"}
+authors = [
+    {name = "Le Huang"},
+    {name = "Jinfang Zheng"},
+    {name = "Haidong Yi"},
+    {name = "Qiwei Ge"},
+    {name = "Tanner Yohe"},
+]
+maintainers = [
+    {name = "Le Huang", email = "lehuang@unc.edu"},
+    {name = "Haidong Yi", email = "haidyi@cs.unc.edu"},
+]
+dependencies = [
+    "numpy>1.19",
+    "scipy",
+    "pandas",
+    "biopython",
+    # for debug logging (referenced from the issue template)
+    "session-info"
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Environment :: Console",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+    "Operating System :: POSIX :: Linux",
+    "Programming Language :: Python :: 3",
+    "Topic :: Scientific/Engineering :: Bio-Informatics"
+]
+
+[project.urls]
+Documentation = "https://dbcan.readthedocs.io/"
+Source = "https://github.com/linnabrown/run_dbcan/"
+Home-page = "https://bcb.unl.edu/dbCAN2/"
+
+[project.scripts]
+run_dbcan = "dbcan.cli.run_dbcan:cli_main"
+cgc_standard2json = "dbcan.cli.cgc_process_json:main"
+syntenic_plot = "dbcan.cli.syntenic_plot:main"
+dbcan_utils = "dbcan.utils.utils:main"
+dbcan_plot = "dbcan.utils.plots:main"
+dbcan_asmfree = "dbcan.utils.diamond_unassembly:main"
+
+[project.optional-dependencies]
+dev = [
+    "pre-commit",
+    "twine>=4.0.2"
+]
+doc = [
+    "docutils>=0.8,!=0.18.*,!=0.19.*",
+    "sphinx>=4",
+    "sphinx-book-theme>=1.0.0",
+    "myst-nb",
+    "sphinxcontrib-bibtex>=1.0.0",
+    "sphinx-autodoc-typehints",
+    "sphinxext-opengraph",
+    # For notebooks
+    "ipykernel",
+    "ipython",
+    "sphinx-copybutton",
+    "sphinx-issues",
+    "sphinx-design",
+    "sphinx-argparse",
+]
+test = [
+    "pytest",
+    "pytest-cov",
+]
+
+[tool.coverage.run]
+source = ["dbcan"]
+omit = [
+    "**/test_*.py",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+xfail_strict = true
+addopts = [
+    "--import-mode=importlib", # allow using test files with same name
+]
+
+[tool.black]
+line-length = 120
+
+[tool.ruff]
+src = ["dbcan"]
+line-length = 120
+select = [
+    "F", # Errors detected by Pyflakes
+    "E", # Error detected by Pycodestyle
+    "W", # Warning detected by Pycodestyle
+    "I", # isort
+    "D", # pydocstyle
+    "B", # flake8-bugbear
+    "TID", # flake8-tidy-imports
+    "C4", # flake8-comprehensions
+    "BLE", # flake8-blind-except
+    "UP", # pyupgrade
+    "RUF100", # Report unused noqa directives
+]
+ignore = [
+    # line too long -> we accept long comment lines; black gets rid of long code lines
+    "E501",
+    # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient
+    "E731",
+    # allow I, O, l as variable names -> I is the identity matrix
+    "E741",
+    # Missing docstring in public package
+    "D104",
+    # Missing docstring in public module
+    "D100",
+    # Missing docstring in __init__
+    "D107",
+    # Errors from function calls in argument defaults. These are fine when the result is immutable.
+    "B008",
+    # __magic__ methods are often self-explanatory, allow missing docstrings
+    "D105",
+    # first line should end with a period [Bug: doesn't work with single-line docstrings]
+    "D400",
+    # First line should be in imperative mood; try rephrasing
+    "D401",
+    ## Disable one in each pair of mutually incompatible rules
+    # We don’t want a blank line before a class docstring
+    "D203",
+    # We want docstrings to start immediately after the opening triple quote
+    "D213",
+]
+
+[tool.ruff.pydocstyle]
+convention = "numpy"
+
+[tool.ruff.per-file-ignores]
+"docs/*" = ["I"]
+"tests/*" = ["D"]
+"*/__init__.py" = ["F401"]
+
+[tool.cruft]
+skip = [
+    "tests",
+    "dbcan/**/__init__.py",
+    "docs/api.md",
+    "docs/changelog.md",
+    "docs/references.bib",
+    "docs/references.md",
+]
+
+[tool.pyright]
+# venv path and venv name
+venvPath = "."
+venv = ".venv"
+
+# set project src and include
+root = "."
+
+include = "./dbcan/**/*.py"
+exclude = [
+    "tests/",
+    ".venv/",
+    "build/"
+]
+
+# report missing imports
+reportMissingImports = true
+
+# report unused imports
+reportUnusedImport = true
+
+typeCheckingMode = "basic"
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 5daf14f28..000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-[egg_info]
-tag_build =
-tag_date = 0
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100755
index 456210902..000000000
--- a/setup.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/env python
-#
-# distutils setup script for dbcan package
-# also installs the bundled Hotpep scripts and data files
-
-from glob import glob
-from os import listdir
-from os.path import isfile
-from setuptools import setup, find_packages
-
-long_description = """This is the standalone version of dbCAN annotation tool for automated CAZyme annotation (known as run_dbCAN.py), written by Tanner Yohe and Le Huang.
-""" - -setup(name='dbcan', - # The version number here is the single source in the package - version="3.0.7", - description='Standalone version of dbCAN annotation tool for automated CAZyme annotation', - long_description=long_description, - author='Tanner Yohe, Le Huang, Qiwei Ge, and Haidong Yi', - author_email='lehuang@unc.edu', - url='https://github.com/linnabrown/run_dbcan', - classifiers=[ - 'Development Status :: 4 - Beta', - 'Environment :: Console', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', - 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python :: 3', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - ], - packages=find_packages( - exclude=[ - "db", - "db.*" - ] - ) + ['dbcan_cli'], - include_package_data=True, - scripts=[ - 'dbcan_cli/hmmscan_parser.py' - ], - entry_points={ - "console_scripts":[ - "run_dbcan = dbcan_cli.run_dbcan:cli_main", - ] - }, - license='GPLv3', - install_requires=[ - 'natsort', - 'setuptools', - 'scipy', - 'psutil', - 'numpy' - ], - python_requires='>=3.5', - zip_safe=False - )