Skip to content

Commit

Permalink
Update patch gtf script to be compatible with the rnaseq pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
Vangelis Theodorakis committed Aug 25, 2023
1 parent 8ed6ea2 commit c2a3cb1
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 15 deletions.
6 changes: 3 additions & 3 deletions .ipynb_checkpoints/nextflow-checkpoint.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
// Global default params, used in configs
params {
gencodeVersion = 44
spliceJunctionOverhang = 100 // Length of reads - 1
star_output_zip = "STARv2710a_genome_GRCh38_noALT_noHLA_noDecoy_ERCC_v44_oh100"
spliceJunctionOverhang = 74 // Length of reads - 1
star_output_zip = "STARv279a_genome_GRCh38_noALT_noHLA_noDecoy_ERCC_v44_oh74"
rsem_output_zip = "rsem_reference_GRCh38_gencode44_ercc"
executor = "slurm"
publish_intermediate_data = false

// Boilerplate options
outdir = "rnaseq-hg38-gencode.v44-bundle"
outdir = "geuvadis-rnaseq-hg38-gencode.v44-bundle"
publish_dir_mode = 'copy'
input = null
email = null
Expand Down
70 changes: 70 additions & 0 deletions bin/.ipynb_checkpoints/patch_ercc_spikes_gtf-checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env python3

import sys
import argparse

def main(argv):
ercc_gtf_file = ''
patched_ercc_gtf_file = ''
parser = argparse.ArgumentParser(description='Add gene- and transcript-level attributes to the ERCC GTF files (original and patched)')
parser.add_argument('--ercc_gtf_file', help='The ERCC spike gtf file')
parser.add_argument('--patched_ercc_gtf_file', help='The patched ERCC spike gtf file')
args = parser.parse_args(argv)

ercc_gtf_file = args.ercc_gtf_file
patched_ercc_gtf_file = args.patched_ercc_gtf_file

with open(ercc_gtf_file) as exon_gtf, open(patched_ercc_gtf_file, 'w') as gene_gtf:
for line in exon_gtf:
f = line.strip().split('\t')
f[0] = f[0].replace('-', '_') # required for RNA-SeQC/GATK (no '-' in contig name)
f[5] = '.'

attr = f[8]
if attr[-1] == ';':
attr = attr[:-1]
attr = dict([i.split(' ') for i in attr.replace('"', '').split('; ')])
# add gene_name, gene_type
attr['gene_name'] = attr['gene_id']
attr['gene_type'] = 'ercc_control'
attr['gene_status'] = 'KNOWN'
attr['level'] = 2
for k in ['id', 'type', 'name', 'status']:
attr[f'transcript_{k}'] = attr[f'gene_{k}']

# Prepare the gene attributes
gene_attr_str = []
for k in ['gene_id', 'gene_type', 'gene_name', 'gene_status']:
gene_attr_str.append(f'{k} "{attr[k]}";')
gene_attr_str.append(f"level {attr['level']};")


# Prepare the transcript attributes
transcript_attr_str = []
for k in ['gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'gene_status', 'transcript_status']:
transcript_attr_str.append(f'{k} "{attr[k]}";')
transcript_attr_str.append(f"level {attr['level']};")

# Prepare the exon attributes
exon_attr_str = []
for k in ['gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'gene_status', 'transcript_status']:
exon_attr_str.append(f'{k} "{attr[k]}";')
exon_attr_str.append(f"level {attr['level']};")

# write gene, transcript, exon

# Set 9th element to the gene attributes
f[8] = ' '.join(gene_attr_str)
gene_gtf.write('\t'.join(f[:2] + ['gene'] + f[3:]) + '\n')

# Set 9th element to the transcript attributes
f[8] = ' '.join(transcript_attr_str)
gene_gtf.write('\t'.join(f[:2] + ['transcript'] + f[3:]) + '\n')

#f[8] = ' '.join(attr_str[:2])
f[8] = ' '.join(exon_attr_str)
gene_gtf.write('\t'.join(f[:2] + ['exon'] + f[3:]) + '\n')


if __name__ == '__main__':
main(sys.argv[1:])
38 changes: 29 additions & 9 deletions bin/patch_ercc_spikes_gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,38 @@ def main(argv):
attr['level'] = 2
for k in ['id', 'type', 'name', 'status']:
attr[f'transcript_{k}'] = attr[f'gene_{k}']

attr_str = []
for k in ['gene_id', 'transcript_id', 'gene_type', 'gene_status', 'gene_name',
'transcript_type', 'transcript_status', 'transcript_name']:
attr_str.append(f'{k} "{attr[k]}";')
attr_str.append(f"level {attr['level']};")
f[8] = ' '.join(attr_str)


# Prepare the gene attributes
gene_attr_str = []
for k in ['gene_id', 'gene_type', 'gene_name', 'gene_status']:
gene_attr_str.append(f'{k} "{attr[k]}";')
gene_attr_str.append(f"level {attr['level']};")


# Prepare the transcript attributes
transcript_attr_str = []
for k in ['gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'gene_status', 'transcript_status']:
transcript_attr_str.append(f'{k} "{attr[k]}";')
transcript_attr_str.append(f"level {attr['level']};")

# Prepare the exon attributes
exon_attr_str = []
for k in ['gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'gene_status', 'transcript_status']:
exon_attr_str.append(f'{k} "{attr[k]}";')
exon_attr_str.append(f"level {attr['level']};")

# write gene, transcript, exon

# Set 9th element to the gene attributes
f[8] = ' '.join(gene_attr_str)
gene_gtf.write('\t'.join(f[:2] + ['gene'] + f[3:]) + '\n')

# Set 9th element to the transcript attributes
f[8] = ' '.join(transcript_attr_str)
gene_gtf.write('\t'.join(f[:2] + ['transcript'] + f[3:]) + '\n')
f[8] = ' '.join(attr_str[:2])

#f[8] = ' '.join(attr_str[:2])
f[8] = ' '.join(exon_attr_str)
gene_gtf.write('\t'.join(f[:2] + ['exon'] + f[3:]) + '\n')


Expand Down
6 changes: 3 additions & 3 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
// Global default params, used in configs
params {
gencodeVersion = 44
spliceJunctionOverhang = 100 // Length of reads - 1
star_output_zip = "STARv2710a_genome_GRCh38_noALT_noHLA_noDecoy_ERCC_v44_oh100"
spliceJunctionOverhang = 74 // Length of reads - 1
star_output_zip = "STARv279a_genome_GRCh38_noALT_noHLA_noDecoy_ERCC_v44_oh74"
rsem_output_zip = "rsem_reference_GRCh38_gencode44_ercc"
executor = "slurm"
publish_intermediate_data = false

// Boilerplate options
outdir = "rnaseq-hg38-gencode.v44-bundle"
outdir = "geuvadis-rnaseq-hg38-gencode.v44-bundle"
publish_dir_mode = 'copy'
input = null
email = null
Expand Down

0 comments on commit c2a3cb1

Please sign in to comment.