DOKK / manpages / debian 11 / libgenome-model-tools-music-perl / Genome::Model::Tools::Music::PathScan.3pm.en
Genome::Model::Tools::Music::PathScan(3pm) User Contributed Perl Documentation Genome::Model::Tools::Music::PathScan(3pm)

For example, a line in the pathway-file would look like:

  hsa00061    Fatty acid biosynthesis    Lipid Metabolism    31:ACACA|32:ACACB|27349:MCAT|2194:FASN|54995:OXSM|55301:OLAH
    

Ensure that the gene names and entrez IDs used match those used in the MAF file. Entrez IDs are not mandatory (use a 0 if Entrez ID unknown). But if a gene name in the MAF does not match any gene name in this file, the entrez IDs are used to find a match (unless it's a 0).

EOS
); }

sub _doc_authors {
return <<EOS
Michael Wendl, Ph.D. EOS }

sub _doc_credits {
return <<EOS This module uses reformatted copies of data from the Kyoto Encyclopedia of Genes and Genomes (KEGG) database:

 * KEGG - http://www.genome.jp/kegg/
EOS
}

sub execute {
my $self = shift;
my $covg_dir = $self->gene_covg_dir;
my $bam_list = $self->bam_list;
my $pathway_file = $self->pathway_file;
my $maf_file = $self->maf_file;
my $output_file = $self->output_file;
my $bgd_mut_rate = $self->bmr;
my $genes_to_ignore = $self->genes_to_ignore;
my $min_mut_genes_per_path = $self->min_mut_genes_per_path;
my $skip_non_coding = $self->skip_non_coding;
my $skip_silent = $self->skip_silent;

    # Check on all the input data before starting work
    print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file );
    print STDERR "Directory with gene coverages not found: $covg_dir\n" unless( -e $covg_dir );
    print STDERR "List of samples not found or is empty: $bam_list\n" unless( -s $bam_list );
    print STDERR "Pathway info file not found or is empty: $pathway_file\n" unless( -s $pathway_file );
    return undef unless( -s $maf_file && -e $covg_dir && -s $bam_list && -s $pathway_file );
    # Build a hash to quickly lookup the genes whose mutations should be ignored
    my %ignored_genes = ();
    if( defined $genes_to_ignore )
    {
        %ignored_genes = map { $_ => 1 } split( /,/, $genes_to_ignore );
    }
    # PathScan uses a helluva lot of hashes - all your RAM are belong to it
    my %sample_gene_hash; # sample => array of genes (based on maf)
    my %gene_path_hash; # gene => array of pathways (based on path_file)
    my %path_hash; # pathway => all the information about the pathways in the database
    my %sample_path_hash; # sample => pathways (based on %sample_gene_hash and %gene_path_hash)
    my %path_sample_hits_hash; # path => sample => hits,mutated_genes
    my %gene_sample_cov_hash; # gene => sample => coverage
    my @all_sample_names; # names of all the samples, no matter if it's mutated or not
    my %id_gene_hash; # entrez id => gene (based on first two columns in MAF)
    # Parse out the names of the samples which should match the names in the MAF file
    my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n";
    while( my $line = $sampleFh->getline )
    {
        next if ( $line =~ m/^#/ );
        chomp( $line );
        my ( $sample ) = split( /\t/, $line );
        push( @all_sample_names, $sample );
    }
    $sampleFh->close;
    # Read coverage data calculated by the Music::Bmr::CalcCovg
    $covg_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any
    read_CoverageFiles( $covg_dir, \@all_sample_names, \%gene_sample_cov_hash );
    #build gene => average_coverage hash for population test
    my %gene_cov_hash;
    foreach my $gene ( keys %gene_sample_cov_hash )
    {
        my $total_cov = 0;
        my $sample_num = scalar( @all_sample_names );
        $total_cov += $gene_sample_cov_hash{$gene}{$_} foreach( @all_sample_names );
        $gene_cov_hash{$gene} = int( $total_cov / $sample_num );
    }
    #build %sample_gene_hash based on maf
    my $maf_fh = IO::File->new( $maf_file );
    while( my $line = $maf_fh->getline )
    {
        next if( $line =~ m/^(#|Hugo_Symbol)/ );
        chomp( $line );
        my @cols = split( /\t/, $line );
        my ( $gene, $entrez_id, $mutation_class, $tumor_sample ) = ( $cols[0], $cols[1], $cols[8], $cols[15] );
        # If the mutation classification is odd, quit with error
        if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|Silent|In_Frame_Ins|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ )
        {
            print STDERR "Unrecognized Variant_Classification $mutation_class in MAF file.\n";
            print STDERR "Please use TCGA MAF Specification v2.3.\n";
            return undef;
        }
        # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
        if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
          ( $skip_silent && $mutation_class =~ m/^Silent$/ ))
        {
            print STDERR "Skipping $mutation_class mutation in gene $gene.\n";
            next;
        }
        # Check that the user followed instructions and named each sample correctly
        unless( grep( /^$tumor_sample$/, @all_sample_names ))
        {
            print STDERR "Sample $tumor_sample in MAF file does not match any in $bam_list\n";
            return undef;
        }
        next if( defined $ignored_genes{$gene} ); # Ignore variants in genes that user wants ignored
        $id_gene_hash{$entrez_id} = $gene unless( $entrez_id eq '' or $entrez_id == 0 or $entrez_id !~ m/^\d+$/ );
        push( @{$sample_gene_hash{$tumor_sample}}, $gene ) unless( grep /^$gene$/, @{$sample_gene_hash{$tumor_sample}} );
    }
    $maf_fh->close;
    my $path_fh = IO::File->new( $pathway_file );
    while( my $line = $path_fh->getline )
    {
        chomp( $line );
        next if( $line =~ /^(#|ID)/ ); #Skip headers
        my ( $path_id, $name, $class, $gene_line, $diseases, $drugs, $description ) = split( /\t/, $line );
        my @genes = split( /\|/, $gene_line ); #Each gene is in the format "EntrezID:GeneSymbol"
        $diseases =~ s/\|/, /g; #Change the separators to commas
        $drugs =~ s/\|/, /g; #Change the separators to commas
        $path_hash{$path_id}{name} = $name unless( $name eq '' );
        $path_hash{$path_id}{class} = $class unless( $class eq '' );
        $path_hash{$path_id}{diseases} = $diseases unless( $diseases eq '' );
        $path_hash{$path_id}{drugs} = $drugs unless( $drugs eq '' );
        $path_hash{$path_id}{description} = $description unless( $description eq '' );
        @{$path_hash{$path_id}{gene}} = ();
        foreach my $gene ( @genes )
        {
            my ( $entrez_id, $gene_symbol ) = split( /:/, $gene );
            unless( $entrez_id eq '' or $entrez_id == 0 or $entrez_id !~ m/^\d+$/ )
            {
                # Use the gene name from the MAF file if the entrez ID matches
                $gene_symbol = $id_gene_hash{$entrez_id} if( defined $id_gene_hash{$entrez_id} );
            }
            push( @{$gene_path_hash{$gene_symbol}}, $path_id ) unless( grep /^$path_id$/, @{$gene_path_hash{$gene_symbol}} );
            unless( grep /^$gene_symbol$/, @{$path_hash{$path_id}{gene}} )
            {
                push( @{$path_hash{$path_id}{gene}}, $gene_symbol );
            }
        }
    }
    $path_fh->close;
    #build a sample => pathway hash
    foreach my $sample ( keys %sample_gene_hash )
    {
        foreach my $gene ( @{$sample_gene_hash{$sample}} )
        {
            if( defined $gene_path_hash{$gene} )
            {
                foreach my $pathway ( @{$gene_path_hash{$gene}} )
                {
                    push( @{$sample_path_hash{$sample}}, $pathway ) unless( grep /^$pathway$/, @{$sample_path_hash{$sample}} );
                }
            }
        }
    }
    #build path_sample_hits_hash, for population test
    foreach my $sample ( keys %sample_path_hash )
    {
        foreach my $path ( @{$sample_path_hash{$sample}} )
        {
            my $hits = 0;
            my @mutated_genes = (); #Mutated genes in this sample belonging to this pathway
            my @mutated_genes_in_sample = @{$sample_gene_hash{$sample}};
            foreach my $gene ( @{$path_hash{$path}{gene}} )
            {
                if( grep /^$gene$/, @mutated_genes_in_sample ) #if this gene is mutated in this sample (in maf)
                {
                    $hits++;
                    push( @mutated_genes, $gene );
                }
            }
            if( $hits > 0 )
            {
                $path_sample_hits_hash{$path}{$sample}{hits} = $hits;
                $path_sample_hits_hash{$path}{$sample}{mutated_genes} = \@mutated_genes;
            }
        }
    }
    #Calculation of p value
    my %data; #For printing
    my @pvals;
    foreach my $path ( sort keys %path_hash )
    {
        my @pathway_genes = @{$path_hash{$path}{gene}};
        my @gene_sizes = ();
        foreach my $gene ( @pathway_genes )
        {
            if( defined $gene_cov_hash{$gene} )
            {
                my $avg_cov = int( $gene_cov_hash{$gene} );
                push( @gene_sizes, $avg_cov ) if( $avg_cov > 3 );
            }
        }
        #If this pathway doesn't have any gene coverage, skip it
        next unless( scalar( @gene_sizes ) > 0 );
        my @num_hits_per_sample; #store hits info for each patient
        my @mutated_samples = sort keys %{$path_sample_hits_hash{$path}};
        foreach my $sample ( @all_sample_names )
        {
            my $hits = 0;
            #if this sample has mutation
            if( grep /^$sample$/, @mutated_samples )
            {
                $hits = $path_sample_hits_hash{$path}{$sample}{hits};
            }
            push( @num_hits_per_sample, $hits );
        }
        #If this pathway doesn't have any mutated genes in any samples, skip it
        next unless( scalar( @num_hits_per_sample ) > 0 );
        my $hits_ref = \@num_hits_per_sample;
        ########### MCW ADDED
        # FIND MAX NUMBER OF HITS IN A SAMPLE
        my $max_hits = 0;
        foreach my $hits_in_sample ( @num_hits_per_sample )
        {
            $max_hits = $hits_in_sample if( $hits_in_sample > $max_hits );
        }
        ########### MCW ADDED
        my $pop_obj = Genome::Model::Tools::Music::PathScan::PopulationPathScan->new( \@gene_sizes );
        if( scalar( @gene_sizes ) >= 3 )
        {
            ########### MCW ADDED
            if( $max_hits > 15 )
            {
                $pop_obj->assign( 5 );
            }
            else
            {
                $pop_obj->assign( 3 );
            }
            ########### MCW ADDED
            #$pop_obj->assign(3);
        }
        elsif( @gene_sizes == 2 )
        {
            $pop_obj->assign( 2 );
        }
        else
        {
            $pop_obj->assign( 1 );
        }
        $pop_obj->preprocess( $bgd_mut_rate, $hits_ref );  #mwendl's new fix
        my $pval = $pop_obj->population_pval_approx($hits_ref);
        $data{$pval}{$path}{samples} = \@mutated_samples;
        $data{$pval}{$path}{hits} = $hits_ref;
        push( @pvals, $pval ); # For calculation of FDR
    }
    # Calculate False Discovery Rates (Benjamini-Hochberg FDR) for the p-values
    my $pval_cnt = scalar( @pvals );
    my %fdr_hash;
    for( my $i = 0; $i < $pval_cnt; $i++ )
    {
        my $fdr = $pvals[$i] * $pval_cnt / ( $pval_cnt - $i );
        $fdr = 1 if $fdr > 1;
        $fdr_hash{$pvals[$i]} = $fdr;
    }
    # Print two output files, one more detailed than the other
    my $out_fh = IO::File->new( $output_file, ">" );
    my $out_detailed_fh = IO::File->new( "$output_file\_detailed", ">" );
    $out_fh->print( "Pathway\tName\tClass\tSamples_Affected\tTotal_Variations\tp-value\tFDR\n" );
    foreach my $pval ( sort { $a <=> $b } keys %data )
    {
        foreach my $path ( sort keys %{$data{$pval}} )
        {
            # Skip this pathway if it has fewer affected genes than the user wants
            my %mutated_gene_hash;
            my @samples = @{$data{$pval}{$path}{samples}};
            foreach my $sample ( @samples )
            {
                foreach my $gene ( @{$path_sample_hits_hash{$path}{$sample}{mutated_genes}} )
                {
                    $mutated_gene_hash{$gene}++;
                }
            }
            next unless ( scalar( keys %mutated_gene_hash ) >= $min_mut_genes_per_path );
            # Print detailed output to a separate output file
            $out_detailed_fh->print( "Pathway: $path\n" );
            $out_detailed_fh->print( "Name: ", $path_hash{$path}{name}, "\n" ) if( defined $path_hash{$path}{name} );
            $out_detailed_fh->print( "Class: ", $path_hash{$path}{class}, "\n" ) if( defined $path_hash{$path}{class} );
            $out_detailed_fh->print( "Diseases: ", $path_hash{$path}{diseases}, "\n" ) if( defined $path_hash{$path}{diseases} );
            $out_detailed_fh->print( "Drugs: ", $path_hash{$path}{drugs}, "\n" ) if( defined $path_hash{$path}{drugs} );
            $out_detailed_fh->print( "P-value: $pval\n", "FDR: ", $fdr_hash{$pval}, "\n" );
            $out_detailed_fh->print( "Description: ", $path_hash{$path}{description}, "\n" );
            my @hits = @{$data{$pval}{$path}{hits}};
            foreach my $sample ( @samples )
            {
                my @mutated_genes = @{$path_sample_hits_hash{$path}{$sample}{mutated_genes}};
                $out_detailed_fh->print( "$sample:" );
                $out_detailed_fh->print( join ",", @mutated_genes );
                $out_detailed_fh->print( "\n" );
            }
            my ( $mutSampleCnt, $totalMutGenes ) = ( 0, 0 );
            $out_detailed_fh->print( "Samples with mutations (#hits): " );
            for( my $i = 0; $i < scalar( @all_sample_names ); ++$i )
            {
                if( $hits[$i] > 0 )
                {
                    $out_detailed_fh->print( "$all_sample_names[$i]($hits[$i]) " );
                    $mutSampleCnt++;
                    $totalMutGenes += $hits[$i];
                }
            }
            $out_detailed_fh->print( "\n\n" );
            # Print tabulated output to the main output file
            my ( $path_name, $path_class ) = ( "-", "-" );
            $path_name = $path_hash{$path}{name} if( defined $path_hash{$path}{name} );
            $path_class = $path_hash{$path}{class} if( defined $path_hash{$path}{class} );
            $out_fh->print( "$path\t$path_name\t$path_class\t$mutSampleCnt\t$totalMutGenes\t",
                "$pval\t", $fdr_hash{$pval}, "\n" );
        }
    }
    $out_detailed_fh->close;
    $out_fh->close;
    return 1;
}

# Reads files for each sample which are formatted as tab-separated lines each showing the number of # bases with sufficient coverage in a gene. sub read_CoverageFiles {
my ( $covg_dir, $all_samples_ref, $gene_sample_cov_hash_ref ) = ( $_[0], $_[1], $_[2] );

    # Read per-gene covered base counts for each sample
    foreach my $sample ( @{$all_samples_ref} )
    {
        # If the file doesn't exist, quit with error. The Music::Bmr::CalcCovg step is incomplete
        unless( -s "$covg_dir/$sample.covg" )
        {
            print STDERR "Couldn't find $sample.covg in $covg_dir. (music bmr calc-covg possibly incomplete)\n";
            exit 1;
        }
        my $covgFh = IO::File->new( "$covg_dir/$sample.covg" );
        while( my $line = $covgFh->getline )
        {
            next if( $line =~ m/^#/ );
            my ( $gene, undef, $covd_bases ) = split( /\t/, $line );
            $gene_sample_cov_hash_ref->{$gene}{$sample} = $covd_bases;
        }
        $covgFh->close;
    }
}

1;

2020-11-06 perl v5.30.3