-
Notifications
You must be signed in to change notification settings - Fork 2
/
gff.py
42 lines (37 loc) · 1.46 KB
/
gff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/python
import pandas as pd
def gff(gff_file, rna_type):
    """Parse a tab-separated GFF annotation file into a pandas DataFrame.

    Each record line is split on tabs. Column 1 becomes ``chr``, column 4
    ``chr_start``, column 5 ``chr_end`` and column 7 ``strand``. All values
    are kept as strings exactly as they appear in the file.

    Args:
        gff_file: Path of the annotation file to read.
        rna_type: ``'miRNA'`` selects the miRNA layout; any other value
            selects the generic layout.

    Returns:
        A DataFrame with columns ``chr``, ``chr_start``, ``chr_end`` and
        ``strand``.

        * miRNA layout: the first 13 lines are skipped, rows whose type
          column (column 3) contains ``miRNA_primary_transcript`` are
          dropped, and the index ``mir_id`` is the text after the last
          ``=`` in the second-to-last ``;``-separated attribute of column 9.
        * generic layout: the first line is skipped and the index
          ``rna_id`` is the last tab-separated field of each line.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a record line has fewer fields than required.
    """
    # Number of leading lines skipped before records begin.
    # NOTE(review): 13 presumably matches the comment header of the miRNA
    # GFF3 files this was written for, and 1 a single header row -- confirm.
    header_lines = 13 if rna_type == 'miRNA' else 1

    # Context manager guarantees the handle is closed after reading.
    with open(gff_file) as handle:
        lines = handle.read().split('\n')

    # Split each record once. Blank lines are filtered out rather than
    # unconditionally dropping the final element, so the last record is kept
    # when the file lacks a trailing newline.
    records = [
        line.split('\t') for line in lines[header_lines:] if line.strip()
    ]

    if rna_type == 'miRNA':
        index_name = 'mir_id'
        rows = [
            (
                fields[8].split(';')[-2].split('=')[-1],
                fields[0],
                fields[3],
                fields[4],
                fields[6],
            )
            for fields in records
            # Keep only rows that are not primary transcripts.
            if 'miRNA_primary_transcript' not in fields[2]
        ]
    else:
        index_name = 'rna_id'
        rows = [
            (fields[-1], fields[0], fields[3], fields[4], fields[6])
            for fields in records
        ]

    columns = [index_name, 'chr', 'chr_start', 'chr_end', 'strand']
    return pd.DataFrame(rows, columns=columns).set_index(index_name)