Skip to content

Commit

Permalink
custom list debug
Browse files: browse the repository at this point in the history
  • Loading branch information
sigven committed Dec 20, 2023
1 parent 3b7e992 commit 98c1c43
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 22 deletions.
29 changes: 13 additions & 16 deletions pcgr/config.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -208,8 +208,7 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log
conf_data['conf']['sample_properties']['phenotype'] = 'None'

if workflow == "CPSR":
if conf_data['conf']['gene_panel']['panel_id'] is not None and \
conf_data['conf']['gene_panel']['diagnostic_grade_only'] is not None:
if conf_data['conf']['gene_panel']['panel_id'] is not None:

if conf_data['conf']['gene_panel']['panel_id'] == "-1":
conf_data['conf']['gene_panel']['description'] = 'User-defined panel (custom geneset from panel 0)'
Expand All @@ -227,15 +226,15 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log
db_dir,
conf_data['genome_assembly'],
conf_data['conf']['gene_panel']['diagnostic_grade_only'],
conf_data['conf']['gene_panel']['custom_list_bed'],
conf_data['conf']['gene_panel']['custom_list_tsv'],
bool(conf_data['conf']['variant_classification']['secondary_findings']),
logger)


return(conf_data)

def set_virtual_target_genes(panel_id: str, db_dir: str, genome_assembly: str, diagnostic_grade_only: bool,
custom_list_bed: str, secondary_findings: bool, logger=None):
custom_list_tsv: str, secondary_findings: bool, logger=None):

all_panels_fname = os.path.join(
db_dir, "data", genome_assembly,
Expand Down Expand Up @@ -292,15 +291,12 @@ def set_virtual_target_genes(panel_id: str, db_dir: str, genome_assembly: str, d


if panel_id == "-1":
if not custom_list_bed == 'None':
if not custom_list_tsv == 'None':
custom_ensembl_gene_ids = {}
if check_file_exists(custom_list_bed, logger):
with open(custom_list_bed) as f:
reader = csv.reader(f, delimiter='\t')
for row in reader:
ensembl_gene_id = row[3].split('|')[1]
custom_ensembl_gene_ids[ensembl_gene_id] = 1
f.close()
if check_file_exists(custom_list_tsv, logger):
custom_genes = csv.DictReader(open(custom_list_tsv,'r'), delimiter='\n', fieldnames=['ensembl_gene_id'])
for row in custom_genes:
custom_ensembl_gene_ids[row['ensembl_gene_id']] = 1

panel_targets = all_virtual_panels[all_virtual_panels['id'] == "0"].copy()
panel_targets = panel_targets[panel_targets['ensembl_gene_id'].isin(custom_ensembl_gene_ids)]
Expand All @@ -310,10 +306,11 @@ def set_virtual_target_genes(panel_id: str, db_dir: str, genome_assembly: str, d
error_message(err_msg, logger)
else:
panel_targets.loc[:,'confidence_level'] = -1
panel_targets.loc[:,'panel_id'] = None
panel_targets.loc[:,'panel_url'] = None
panel_targets.loc[:,'panel_version'] = None
panel_targets.loc[:,'panel_id'] = -5
panel_targets.loc[:,'panel_url'] = 'None'
panel_targets.loc[:,'panel_version'] = 1.0
panel_targets.loc[:,'panel_name'] = "CustomPanel"
panel_targets.loc[:,'primary_target'] = True
for f in ['panel_url', 'panel_name','moi','mod','symbol','ensembl_gene_id']:
panel_targets[f] = panel_targets[f].astype(str)

Expand All @@ -332,7 +329,7 @@ def set_virtual_target_genes(panel_id: str, db_dir: str, genome_assembly: str, d

all_targets = panel_targets

if len(all_secondary_finding_targets) > 0:
if len(all_secondary_finding_targets) > 0:
all_targets = pd.concat([panel_targets, all_secondary_finding_targets], axis=0)

return all_targets.to_dict(orient='records')
Expand Down
14 changes: 8 additions & 6 deletions scripts/cpsr_validate_input.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -112,17 +112,19 @@ def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, pcgr_dir, geno

## Add custom set of genes to target BED
logger.info('Creating BED file with custom target genes: ' + str(genelist_bed_fname))
id_pat = '|'.join([f"\|{g}\|" for g in valid_custom_identifiers])
id_pat = '|'.join([f"{g}" for g in valid_custom_identifiers])

id_pat_ext = id_pat + '|(\|tag\|)|' + '(\|ACMG_SF\|)'
id_pat_ext = id_pat + '|(\|tag\|)|' + 'ACMG_SF'
awk_command = "awk 'BEGIN{FS=\"\\t\"}{if($4 !~ /ACMG_SF/ || ($4 ~ /ACMG_SF/ && $4 ~ /" + '|'.join(valid_custom_identifiers) + "/))print;}'"
cmd_target_regions_bed = f"bgzip -dc {virtualpanel_track_bed} | egrep '{id_pat_ext}' > {genelist_bed_fname_unsorted}"
if gwas_findings == 0 and secondary_findings == 1:
cmd_target_regions_bed = f"bgzip -dc {virtualpanel_track_bed} | egrep '{id_pat}' | egrep -v '(\|tag\|)' > {genelist_bed_fname_unsorted}"
cmd_target_regions_bed = f"bgzip -dc {virtualpanel_track_bed} | egrep '{id_pat_ext}' | egrep -v '(\|tag\|)' > {genelist_bed_fname_unsorted}"
if gwas_findings == 0 and secondary_findings == 0:
cmd_target_regions_bed = f"bgzip -dc {virtualpanel_track_bed} | egrep '{id_pat}' | egrep -v '(\|tag\|)|(\ACMG_SF\|)' > {genelist_bed_fname_unsorted}"
cmd_target_regions_bed = f"bgzip -dc {virtualpanel_track_bed} | egrep '{id_pat_ext}' | egrep -v '(\|tag\|)' | {awk_command} > {genelist_bed_fname_unsorted}"
if gwas_findings == 1 and secondary_findings == 0:
cmd_target_regions_bed = f"bgzip -dc {virtualpanel_track_bed} | egrep '{id_pat}' | egrep -v '(\ACMG_SF\|)' > {genelist_bed_fname_unsorted}"
cmd_target_regions_bed = f"bgzip -dc {virtualpanel_track_bed} | egrep '{id_pat_ext}' | {awk_command} > {genelist_bed_fname_unsorted}"

#print(cmd_target_regions_bed)
check_subprocess(logger, cmd_target_regions_bed, debug)

## Sort regions in target BED
Expand Down Expand Up @@ -218,7 +220,7 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, pcgr_directory, geno

## Concatenate all panel BEDs to one big virtual panel BED, sort and make unique
panel_ids = str(virtual_panel_id).split(',')
for pid in panel_ids:
for pid in set(panel_ids):
ge_panel_identifier = "GE_PANEL_" + str(pid)
target_bed_gz = os.path.join(
pcgr_directory,'data',genome_assembly, 'gene','bed','gene_virtual_panel', str(pid) + ".bed.gz")
Expand Down

0 comments on commit 98c1c43

Please sign in to comment.