-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add script to parse the Complex Portal data file
Refs #1166
- Loading branch information
1 parent
aea53a3
commit 1ae2cbe
Showing
1 changed file
with
92 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
#!/usr/bin/env perl | ||
|
||
# Parse the pombe data file from the Complex Portal FTP site | ||
# http://ftp.ebi.ac.uk/pub/databases/intact/complex/current/complextab/284812.tsv | ||
# and write a file of pombe ID to Complex Portal ID mappings | ||
# and a file of Complex Portal IDs and complex names | ||
|
||
use strict; | ||
use warnings; | ||
use Carp; | ||
|
||
# Usage check: exactly five positional arguments are expected, three input | ||
# values followed by two output filenames.  Anything else prints the usage | ||
# text below and stops. | ||
unless (@ARGV == 5) { | ||
die qq|$0: needs 5 arguments: | ||
Input: | ||
- PomBase2UniProt.tsv (pombe-embl/ftp_site/pombe/names_and_identifiers/) | ||
- complex_portal_pombe_data.tsv (from http://ftp.ebi.ac.uk/pub/databases/intact/complex/current/complextab/284812.tsv) | ||
- Complex_Portal_PubMed_ID (probably "PMID:30357405") | ||
Output: | ||
- pombe_to_complex_id_mapping.tsv | ||
- complex_id_and_names.tsv | ||
|; | ||
} | ||
|
||
|
||
# First argument: the tab-separated PomBase ID / UniProt accession file. | ||
my $uniprot_mapping_filename = shift; | ||
|
||
# A failed open() reports its reason in $!; $? is the status of the last | ||
# child process and says nothing about why the file could not be opened. | ||
open my $uniprot_mapping, '<', $uniprot_mapping_filename | ||
or die "can't open $uniprot_mapping_filename: $!"; | ||
|
||
# Lookup table keyed by UniProt accession, giving the pombe ID. | ||
my %uniprot_map = (); | ||
|
||
while (defined (my $line = <$uniprot_mapping>)) { | ||
next if $line =~ /^#/; | ||
|
||
chomp $line; | ||
|
||
my ($pombe_id, $uniprot_id) = split /\t/, $line; | ||
|
||
# Blank lines and rows with no second column cannot be mapped; skipping | ||
# them avoids storing an entry under an undefined key. | ||
next unless defined $pombe_id && defined $uniprot_id; | ||
|
||
$uniprot_map{$uniprot_id} = $pombe_id; | ||
} | ||
|
||
close $uniprot_mapping | ||
or die "can't close $uniprot_mapping_filename: $!"; | ||
|
||
|
||
# Second argument: the Complex Portal data file to parse. | ||
my $complex_portal_filename = shift; | ||
|
||
# On failure open() sets $!, so that is what the message reports; $? is | ||
# the exit status of the last child process and is unrelated to open(). | ||
open my $complex_portal_file, '<', $complex_portal_filename | ||
or die "can't open $complex_portal_filename: $!"; | ||
|
||
# Third argument: the reference written as the last column of every | ||
# pombe ID to complex ID row. | ||
my $complex_portal_pubmed_id = shift; | ||
|
||
# Fourth argument: output file for the pombe ID to complex ID rows. | ||
my $pombe_to_complex_id_mapping_filename = shift; | ||
|
||
open my $pombe_to_complex_id_mapping_file, '>', $pombe_to_complex_id_mapping_filename | ||
or die "can't open $pombe_to_complex_id_mapping_filename: $!"; | ||
|
||
# Fifth argument: output file for the complex ID and complex name rows. | ||
my $complex_ids_and_names_filename = shift; | ||
|
||
open my $complex_ids_and_names_file, '>', $complex_ids_and_names_filename | ||
or die "can't open $complex_ids_and_names_filename: $!"; | ||
|
||
|
||
# Read the Complex Portal file one row at a time.  Every row adds one line | ||
# to the ID/name output, plus one line to the mapping output for each | ||
# participant found in the UniProt map.  Unparseable input stops the run | ||
# with the file name and line number. | ||
while (defined (my $line = <$complex_portal_file>)) { | ||
next if $line =~ /^#/; | ||
|
||
chomp $line; | ||
|
||
# A blank line has no columns; without this check it would be written | ||
# out as an empty ID/name row. | ||
next if $line =~ /^\s*$/; | ||
|
||
# Only the first, second and fifth columns are used; the third and | ||
# fourth are skipped. | ||
my ($complex_portal_acc, $complex_name, undef, undef, $identifiers) = | ||
split /\t/, $line; | ||
|
||
if (!defined $identifiers) { | ||
die "$complex_portal_filename:$.: missing identifiers column\n"; | ||
} | ||
|
||
print $complex_ids_and_names_file "$complex_portal_acc\t$complex_name\n"; | ||
|
||
for my $id_details (split /\|/, $identifiers) { | ||
# Each entry is an ID followed by a parenthesised suffix.  Keep the ID | ||
# in a named variable rather than relying on $1 staying intact | ||
# across the later CHEBI match. | ||
my ($participant_id) = $id_details =~ /(.*)\(.*\)/; | ||
|
||
if (!defined $participant_id) { | ||
die "$complex_portal_filename:$.: can't parse ID details: $id_details\n"; | ||
} | ||
|
||
# ChEBI entries are not looked up in the UniProt map. | ||
next if $id_details =~ /CHEBI:\d+/; | ||
|
||
my $pombe_id = $uniprot_map{$participant_id}; | ||
|
||
if (!defined $pombe_id) { | ||
die "$complex_portal_filename:$.: can't find pombe ID for $participant_id\n"; | ||
} | ||
|
||
print $pombe_to_complex_id_mapping_file "$pombe_id\t$complex_portal_acc\t$complex_portal_pubmed_id\n"; | ||
} | ||
} | ||
|
||
# close() can fail, notably on the output files when buffered data cannot | ||
# be flushed, so each message names the file and the reason from $!. | ||
close $complex_portal_file | ||
or die "can't close $complex_portal_filename: $!"; | ||
close $pombe_to_complex_id_mapping_file | ||
or die "can't close $pombe_to_complex_id_mapping_filename: $!"; | ||
close $complex_ids_and_names_file | ||
or die "can't close $complex_ids_and_names_filename: $!"; |