Genome::Model::Tools::Music::Bmr::CalcBmr(3pm)

sub execute {
my $self = shift;
my $roi_file = $self->roi_file;
my $ref_seq = $self->reference_sequence;
my $bam_list = $self->bam_list;
my $output_dir = $self->output_dir;
my $maf_file = $self->maf_file;
my $show_skipped = $self->show_skipped;
my $bmr_groups = $self->bmr_groups;
my $separate_truncations = $self->separate_truncations;
my $merge_concurrent_muts = $self->merge_concurrent_muts;
my $genes_to_ignore = $self->genes_to_ignore;
my $skip_non_coding = $self->skip_non_coding;
my $skip_silent = $self->skip_silent;

    # Check on all the input data before starting work
    print STDERR "ROI file not found or is empty: $roi_file\n" unless( -s $roi_file );
    print STDERR "Reference sequence file not found: $ref_seq\n" unless( -e $ref_seq );
    print STDERR "List of BAMs not found or is empty: $bam_list\n" unless( -s $bam_list );
    print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir );
    print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file );
    return undef unless( -s $roi_file && -e $ref_seq && -s $bam_list && -e $output_dir && -s $maf_file );
    # Check on the files we expect to find within the provided output directory
    $output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any
    my $gene_covg_dir = "$output_dir/gene_covgs"; # Should contain per-gene coverage files per sample
    my $total_covgs_file = "$output_dir/total_covgs"; # Should contain overall coverages per sample
    print STDERR "Directory with per-gene coverages not found: $gene_covg_dir\n" unless( -e $gene_covg_dir );
    print STDERR "Total coverages file not found or is empty: $total_covgs_file\n" unless( -s $total_covgs_file );
    return undef unless( -e $gene_covg_dir && -s $total_covgs_file );
    # Outputs of this script will be written to these locations in the output directory
    my $overall_bmr_file = "$output_dir/overall_bmrs";
    my $gene_mr_file = "$output_dir/gene_mrs";
    $self->gene_mr_file( $gene_mr_file );
    # Build a hash to quickly lookup the genes to be ignored for overall BMRs
    my %ignored_genes = ();
    if( defined $genes_to_ignore ) {
        %ignored_genes = map { $_ => 1 } split( /,/, $genes_to_ignore );
    }
    # Parse out the names of the samples which should match the names of the coverage files needed
    my ( @all_sample_names, %sample_idx );
    my $idx = 0;
    my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!";
    while( my $line = $sampleFh->getline ) {
        next if ( $line =~ m/^#/ );
        chomp( $line );
        my ( $sample ) = split( /\t/, $line );
        push( @all_sample_names, $sample );
        $sample_idx{$sample} = $idx++;
    }
    $sampleFh->close;
    # If the reference sequence FASTA file hasn't been indexed, do it
    my $ref_seq_idx = "$ref_seq.fai";
    Genome::Sys->shellcmd( cmd => "samtools faidx $ref_seq" ) unless( -e $ref_seq_idx );
    # Parse gene names and ROIs. Mutations outside these ROIs will be skipped
    my ( @all_gene_names, %gene_idx );
    $idx = 0;
    my $roi_bitmask = $self->create_empty_genome_bitmask( $ref_seq_idx );
    my $roiFh = IO::File->new( $roi_file ) or die "Couldn't open $roi_file. $!";
    while( my $line = $roiFh->getline ) {
        next if( $line =~ m/^#/ );
        chomp $line;
        my ( $chr, $start, $stop, $gene ) = split( /\t/, $line );
        if( !$roi_bitmask->{$chr} or $start > $roi_bitmask->{$chr}->Size ) {
            print STDERR "Skipping invalid ROI bitmask $chr:$start-$stop\n";
            next;
        }
        $roi_bitmask->{$chr}->Interval_Fill( $start, $stop );
        unless( defined $gene_idx{$gene} ) {
            push( @all_gene_names, $gene );
            $gene_idx{$gene} = $idx++;
        }
    }
    $roiFh->close;
    # These are the various categories that each mutation will be classified into
    my @mut_classes = ( AT_Transitions, AT_Transversions, CG_Transitions, CG_Transversions, CpG_Transitions, CpG_Transversions, Indels );
    push( @mut_classes, Truncations ) if( $separate_truncations );
    # Save the actual class names for reporting purposes, because the elements above are really just numerical constants
    my @mut_class_names = qw( AT_Transitions AT_Transversions CG_Transitions CG_Transversions CpG_Transitions CpG_Transversions Indels );
    push( @mut_class_names, 'Truncations' ) if( $separate_truncations );
    my @sample_mr; # Stores per sample covg and mutation information
    foreach my $sample ( @all_sample_names ) {
        $sample_mr[$sample_idx{$sample}][$_][mutations] = 0 foreach( @mut_classes );
        $sample_mr[$sample_idx{$sample}][$_][covd_bases] = 0 foreach( @mut_classes );
    }
    # Load the covered base-counts per sample from the output of "music bmr calc-covg"
    print STDERR "Loading per-sample coverages stored in $total_covgs_file\n";
    my $sample_cnt_in_total_covgs_file = 0;
    my $totCovgFh = IO::File->new( $total_covgs_file ) or die "Couldn't open $total_covgs_file. $!";
    while( my $line = $totCovgFh->getline ) {
        next unless( $line =~ m/^\S+\t\d+\t\d+\t\d+\t\d+$/ and $line !~ m/^#/ );
        chomp( $line );
        ++$sample_cnt_in_total_covgs_file;
        my ( $sample, $covd_bases, $covd_at_bases, $covd_cg_bases, $covd_cpg_bases ) = split( /\t/, $line );
        $sample_mr[$sample_idx{$sample}][AT_Transitions][covd_bases] = $covd_at_bases;
        $sample_mr[$sample_idx{$sample}][AT_Transversions][covd_bases] = $covd_at_bases;
        $sample_mr[$sample_idx{$sample}][CG_Transitions][covd_bases] = $covd_cg_bases;
        $sample_mr[$sample_idx{$sample}][CG_Transversions][covd_bases] = $covd_cg_bases;
        $sample_mr[$sample_idx{$sample}][CpG_Transitions][covd_bases] = $covd_cpg_bases;
        $sample_mr[$sample_idx{$sample}][CpG_Transversions][covd_bases] = $covd_cpg_bases;
        $sample_mr[$sample_idx{$sample}][Indels][covd_bases] = $covd_bases;
        $sample_mr[$sample_idx{$sample}][Truncations][covd_bases] = $covd_bases if( $separate_truncations );
    }
    $totCovgFh->close;
    unless( $sample_cnt_in_total_covgs_file == scalar( @all_sample_names )) {
        print STDERR "Mismatching number of samples in $total_covgs_file and $bam_list\n";
        return undef;
    }
    my @gene_mr; # Stores per gene covg and mutation information
    foreach my $gene ( @all_gene_names ) {
        foreach my $sample ( @all_sample_names ) {
            $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][mutations] = 0 foreach( @mut_classes );
            $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][covd_bases] = 0 foreach( @mut_classes );
        }
    }
    # Sum up the per-gene covered base-counts across samples from the output of "music bmr calc-covg"
    print STDERR "Loading per-gene coverage files stored under $gene_covg_dir/\n";
    foreach my $sample ( @all_sample_names ) {
        my $sample_covg_file = "$gene_covg_dir/$sample.covg";
        my $sampleCovgFh = IO::File->new( $sample_covg_file ) or die "Couldn't open $sample_covg_file. $!";
        while( my $line = $sampleCovgFh->getline ) {
            next unless( $line =~ m/^\S+\t\d+\t\d+\t\d+\t\d+\t\d+$/ and $line !~ m/^#/ );
            chomp( $line );
            my ( $gene, undef, $covd_bases, $covd_at_bases, $covd_cg_bases, $covd_cpg_bases ) = split( /\t/, $line );
            $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][AT_Transitions][covd_bases] += $covd_at_bases;
            $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][AT_Transversions][covd_bases] += $covd_at_bases;
            $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CG_Transitions][covd_bases] += $covd_cg_bases;
            $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CG_Transversions][covd_bases] += $covd_cg_bases;
            $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CpG_Transitions][covd_bases] += $covd_cpg_bases;
            $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CpG_Transversions][covd_bases] += $covd_cpg_bases;
            $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][Indels][covd_bases] += $covd_bases;
            $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][Truncations][covd_bases] += $covd_bases if( $separate_truncations );
        }
        $sampleCovgFh->close;
    }
    # Run "joinx ref-stats" to classify SNVs as being at AT, CG, or CpG sites in the reference
    print STDERR "Running 'joinx ref-stats' to read reference FASTA and identify SNVs at AT, CG, CpG sites\n";
    my $maf_bed = Genome::Sys->create_temp_file_path();
    my $mafBedFh = IO::File->new( $maf_bed, ">" ) or die "Temporary file could not be created. $!";
    my $mafFh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!";
    while( my $line = $mafFh->getline ) {
        next if( $line =~ m/^(#|Hugo_Symbol)/ );
        chomp $line;
        my @cols = split( /\t/, $line );
        my ( $chr, $start, $stop, $mutation_type, $ref, $var1, $var2 ) = @cols[4..6,9..12];
        if( $mutation_type =~ m/^(SNP|DNP|ONP|TNP)$/ ) {
            $mafBedFh->print( "$chr\t" . ( $start - 2 ) . "\t" . ( $start + 1 ) . "\n" );
        }
    }
    $mafFh->close;
    $mafBedFh->close;
    my $refstats_file = Genome::Sys->create_temp_file_path();
    Genome::Sys->shellcmd( cmd => "joinx ref-stats --ref-bases --bed $maf_bed --fasta $ref_seq --output $refstats_file" );
    # Parse through the ref-stats output and load it into hashes for quick lookup later
    my ( %ref_base, %cpg_site );
    my $refStatsFh = IO::File->new( $refstats_file ) or die "Couldn't open $refstats_file. $!";
    while( my $line = $refStatsFh->getline ) {
        next if( $line =~ m/^#/ );
        chomp $line;
        my ( $chr, undef, $pos, undef, undef, undef, $ref ) = split( /\t/, $line );
        my $locus = "$chr\t" . ( $pos - 1 );
        $ref_base{$locus} = substr( $ref, 1, 1 );
        $cpg_site{$locus} = 1 if( $ref =~ m/CG/ );
    }
    $refStatsFh->close;
    # Create a hash to help classify SNVs
    my %classify;
    $classify{$_} = AT_Transitions foreach( qw( AG TC ));
    $classify{$_} = AT_Transversions foreach( qw( AC AT TA TG ));
    $classify{$_} = CG_Transitions foreach( qw( CT GA ));
    $classify{$_} = CG_Transversions foreach( qw( CA CG GC GT ));
    # Parse through the MAF file and categorize each somatic mutation
    print STDERR "Parsing MAF file to classify mutations\n";
    my %skip_cnts;
    $mafFh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!";
    while( my $line = $mafFh->getline ) {
        next if( $line =~ m/^(#|Hugo_Symbol)/ );
        chomp $line;
        my @cols = split( /\t/, $line );
        my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15];
        # Skip mutations in samples that are not in the provided bam list
        unless( defined $sample_idx{$sample} ) {
            $skip_cnts{"belong to unrecognized samples"}++;
            print STDERR "Skipping unrecognized sample ($sample not in BAM list): $gene, $chr:$start-$stop\n" if( $show_skipped );
            next;
        }
        # If the mutation classification is odd, quit with error
        if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
            print STDERR "Unrecognized Variant_Classification \"$mutation_class\" in MAF file: $gene, $chr:$start-$stop\n";
            print STDERR "Please use TCGA MAF Specification v2.3.\n";
            return undef;
        }
        # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
        if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
          ( $skip_silent && $mutation_class =~ m/^Silent$/ )) {
            $skip_cnts{"are classified as $mutation_class"}++;
            print STDERR "Skipping $mutation_class mutation: $gene, $chr:$start-$stop\n" if( $show_skipped );
            next;
        }
        # If the mutation type is odd, quit with error
        if( $mutation_type !~ m/^(SNP|DNP|TNP|ONP|INS|DEL|Consolidated)$/ ) {
            print STDERR "Unrecognized Variant_Type \"$mutation_type\" in MAF file: $gene, $chr:$start-$stop\n";
            print STDERR "Please use TCGA MAF Specification v2.3.\n";
            return undef;
        }
        # Skip mutations that were consolidated into others (E.g. SNP consolidated into a TNP)
        if( $mutation_type =~ m/^Consolidated$/ ) {
            $skip_cnts{"are consolidated into another"}++;
            print STDERR "Skipping consolidated mutation: $gene, $chr:$start-$stop\n" if( $show_skipped );
            next;
        }
        # Skip mutations that fall completely outside any of the provided regions of interest
        if( $self->count_bits( $roi_bitmask->{$chr}, $start, $stop ) == 0 ) {
            $skip_cnts{"are outside any ROIs"}++;
            print STDERR "Skipping mutation that falls outside ROIs: $gene, $chr:$start-$stop\n" if( $show_skipped );
            next;
        }
        # Skip mutations whose gene names don't match any of those in the ROI list
        unless( defined $gene_idx{$gene} ) {
            $skip_cnts{"have unrecognized gene names"}++;
            print STDERR "Skipping unrecognized gene name (not in ROI file): $gene, $chr:$start-$stop\n" if( $show_skipped );
            next;
        }
        my $class = '';
        # Check if the mutation is the truncating type, if the user wanted a separate category of those
        if( $separate_truncations && $mutation_class =~ m/^(Nonsense_Mutation|Splice_Site|Frame_Shift_Del|Frame_Shift_Ins)/ ) {
            $class = Truncations;
        }
        # Classify the mutation as AT/CG/CpG Transition, AT/CG/CpG Transversion
        elsif( $mutation_type =~ m/^(SNP|DNP|ONP|TNP)$/ ) {
            # ::TBD:: For DNPs and TNPs, we use only the first base for mutation classification
            $ref = substr( $ref, 0, 1 );
            $var1 = substr( $var1, 0, 1 );
            $var2 = substr( $var2, 0, 1 );
            # If the alleles are anything but A, C, G, or T then quit with error
            if( $ref !~ m/[ACGT]/ || $var1 !~ m/[ACGT]/ || $var2 !~ m/[ACGT]/ ) {
                print STDERR "Unrecognized allele in column Reference_Allele, Tumor_Seq_Allele1, or Tumor_Seq_Allele2: $gene, $chr:$start-$stop\n";
                print STDERR "Please use TCGA MAF Specification v2.3.\n";
                return undef;
            }
            # Use the classify hash to find whether this SNV is an AT/CG Transition/Transversion
            $class = $classify{ "$ref$var1" } if( defined $classify{ "$ref$var1" } );
            $class = $classify{ "$ref$var2" } if( defined $classify{ "$ref$var2" } );
            # Check if the ref base in the MAF matched what we fetched from the ref-seq
            my $locus = "$chr\t$start";
            if( defined $ref_base{$locus} && $ref_base{$locus} ne $ref ) {
                print STDERR "Reference allele $ref for $gene variant at $chr:$start-$stop is " . $ref_base{$locus} . " in the FASTA. Using it anyway.\n";
            }
            # Check if a C or G reference allele belongs to a CpG pair in the refseq
            if(( $ref eq 'C' || $ref eq 'G' ) && defined $cpg_site{$locus} ) {
                $class = (( $class == CG_Transitions ) ? CpG_Transitions : CpG_Transversions );
            }
        }
        # Classify it as an indel (excludes splice-site and frame-shift if user wanted truncations separately)
        elsif( $mutation_type =~ m/^(INS|DEL)$/ ) {
            $class = Indels;
        }
        # The user's gene exclusion list affects only the overall BMR calculations
        $sample_mr[$sample_idx{$sample}][$class][mutations]++ unless( defined $ignored_genes{$gene} );
        $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations]++;
    }
    $mafFh->close;
    # Display statistics related to parsing the MAF
    print STDERR "Finished Parsing the MAF file to classify mutations\n";
    foreach my $skip_type ( sort {$skip_cnts{$b} <=> $skip_cnts{$a}} keys %skip_cnts ) {
        print STDERR "Skipped " . $skip_cnts{$skip_type} . " mutation(s) that $skip_type\n" if( defined $skip_cnts{$skip_type} );
    }
    # If the user wants, merge together concurrent mutations of a gene in the same sample
    if( $merge_concurrent_muts ) {
        foreach my $sample ( @all_sample_names ) {
            foreach my $gene ( @all_gene_names ) {
                next unless( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}] );
                my $num_muts = 0;
                $num_muts += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][mutations] foreach( @mut_classes );
                if( $num_muts > 1 ) {
                    foreach my $class ( @mut_classes ) {
                        my $muts_in_class = $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations]; # Num of muts of gene in this class
                        $sample_mr[$sample_idx{$sample}][$class][mutations] -= $muts_in_class; # Take it out of the sample total
                        $muts_in_class /= $num_muts; # Turn it into a fraction of the total number of muts in this gene
                        $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations] = $muts_in_class; # Use the fraction as the num muts of gene in this class
                        $sample_mr[$sample_idx{$sample}][$class][mutations] += $muts_in_class; # Add the same fraction to the sample total
                    }
                }
            }
        }
    }
    # Calculate per-sample BMRs, and also subtract out covered bases in genes the user wants ignored
    foreach my $sample ( @all_sample_names ) {
        my $tot_muts = 0;
        foreach my $class ( @mut_classes ) {
            # Subtract the covered bases in this class that belong to the genes to be ignored
            # ::TBD:: Some of these bases may also belong to another gene (on the other strand maybe?), and those should not be subtracted
            foreach my $ignored_gene ( keys %ignored_genes ) {
                $sample_mr[$sample_idx{$sample}][$class][covd_bases] -= $gene_mr[$sample_idx{$sample}][$gene_idx{$ignored_gene}][$class][covd_bases] if( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$ignored_gene}] );
            }
            $tot_muts += $sample_mr[$sample_idx{$sample}][$class][mutations];
        }
        $sample_mr[$sample_idx{$sample}][Overall][bmr] = $tot_muts / $sample_mr[$sample_idx{$sample}][Indels][covd_bases];
    }
    # Cluster samples into bmr-groups using k-means clustering
    my @sample_bmrs = map { $sample_mr[$sample_idx{$_}][Overall][bmr] } @all_sample_names;
    my @bmr_clusters = k_means( $bmr_groups, \@sample_bmrs );
    # Calculate overall BMRs for each cluster of samples, and print them to file
    my %cluster_bmr; # Stores per cluster categorized BMR
    my $totBmrFh = IO::File->new( $overall_bmr_file, ">" ) or die "Couldn't open $overall_bmr_file. $!";
    $totBmrFh->print( "#User-specified genes skipped in these calculations: $genes_to_ignore\n" ) if( defined $genes_to_ignore );
    my ( $covered_bases_sum, $mutations_sum ) = ( 0, 0 );
    for( my $i = 0; $i < scalar( @bmr_clusters ); ++$i ) {
        my @samples_in_cluster = map { $all_sample_names[$_] } @{$bmr_clusters[$i]};
        unless( $bmr_groups == 1 ) {
            $totBmrFh->print( "#BMR sub-group ", $i + 1, " (", scalar( @{$bmr_clusters[$i]} ), " samples)\n" );
            $totBmrFh->print( "#Samples: ", join( ",", @samples_in_cluster ), "\n" );
        }
        $totBmrFh->print( "#Mutation_Class\tCovered_Bases\tMutations\tOverall_BMR\n" );
        my ( $tot_covd_bases, $tot_muts ) = ( 0, 0 );
        foreach my $class ( @mut_classes ) {
            my ( $covd_bases, $mutations ) = ( 0, 0 );
            foreach my $sample ( @samples_in_cluster ) {
                $covd_bases += $sample_mr[$sample_idx{$sample}][$class][covd_bases];
                $mutations += $sample_mr[$sample_idx{$sample}][$class][mutations];
            }
            $tot_covd_bases = $covd_bases if( $class == Indels ); # Save this to calculate overall BMR below
            # Calculate overall BMR for this mutation class and print it to file
            $cluster_bmr{$i}[$class][bmr] = ( $covd_bases == 0 ? 0 : ( $mutations / $covd_bases ));
            $totBmrFh->print( join( "\t", $mut_class_names[$class], $covd_bases, $mutations, $cluster_bmr{$i}[$class][bmr] ), "\n" );
            $tot_muts += $mutations;
        }
        $totBmrFh->print( join( "\t", "Overall_BMR", $tot_covd_bases, $tot_muts, $tot_muts / $tot_covd_bases ), "\n\n" );
        $covered_bases_sum += $tot_covd_bases;
        $mutations_sum += $tot_muts;
    }
    $totBmrFh->close;
    $self->bmr_output( $mutations_sum / $covered_bases_sum );
    # Print out a file containing per-gene mutation counts and covered bases for use by "music smg"
    my $geneBmrFh = IO::File->new( $gene_mr_file, ">" ) or die "Couldn't open $gene_mr_file. $!";
    $geneBmrFh->print( "#Gene\tMutation_Class\tCovered_Bases\tMutations\tBMR\n" );
    foreach my $gene ( sort @all_gene_names ) {
        my ( $tot_covd_bases, $tot_muts ) = ( 0, 0 );
        for( my $i = 0; $i < scalar( @bmr_clusters ); ++$i ) {
            my @samples_in_cluster = map { $all_sample_names[$_] } @{$bmr_clusters[$i]};
            foreach my $class ( @mut_classes ) {
                my ( $covd_bases, $mutations ) = ( 0, 0 );
                foreach my $sample( @samples_in_cluster ) {
                    if( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}] ) {
                        $covd_bases += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][covd_bases];
                        $mutations += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations];
                    }
                }
                my $rename_class = $mut_class_names[$class];
                $rename_class = ( $rename_class . "_SubGroup" . ( $i + 1 )) if( $bmr_groups > 1 );
                $geneBmrFh->print( join( "\t", $gene, $rename_class, $covd_bases, $mutations, $cluster_bmr{$i}[$class][bmr] ), "\n" );
                $tot_muts += $mutations;
                $tot_covd_bases += $covd_bases if( $class == Indels );
            }
        }
        $geneBmrFh->print( join( "\t", $gene, "Overall", $tot_covd_bases, $tot_muts, $self->bmr_output ), "\n" );
    }
    $geneBmrFh->close;
    return 1;
}

# Creates an empty whole genome bitmask based on the given reference sequence index sub create_empty_genome_bitmask {
my ( $self, $ref_seq_idx_file ) = @_;
my %genome;
my $refFh = IO::File->new( $ref_seq_idx_file ) or die "Couldn't open $ref_seq_idx_file. $!";
while( my $line = $refFh->getline ) {
my ( $chr, $length ) = split( /\t/, $line );
$genome{$chr} = Bit::Vector->new( $length + 1 ); # Adding a base for 1-based coordinates
}
$refFh->close;
return \%genome; }

# Counts the number of bits that are set in the given region of a Bit:Vector sub count_bits {
my ( $self, $vector, $start, $stop ) = @_;
my $count = 0;
for my $pos ( $start..$stop ) {
++$count if( $vector->bit_test( $pos ));
}
return $count; }

# Given a list of numerical values, returns k clusters based on k-means clustering sub k_means {
my ( $k, $list_ref ) = @_;
my @vals = @{$list_ref};
my $num_vals = scalar( @vals );

    # Start with the first k values as the centroids
    my @centroids = @vals[0..($k-1)];
    my @prev_centroids = map { 0 } @centroids;
    my @groups = ();
    my $diff_means = 1; # Arbitrary non-zero value
    # Repeat until the difference between these centroids and the previous ones, converges to zero
    while( $diff_means > 0 ) {
        @groups = ();
        # Group values into clusters based on closest centroid
        for( my $i = 0; $i < $num_vals; ++$i ) {
            my @distances = map { abs( $vals[$i] - $_ ) } @centroids;
            my $closest = min( @distances );
            for( my $j = 0; $j < $k; ++$j ) {
                if( $distances[$j] == $closest ) { push( @{$groups[$j]}, $i ); last; }
            }
        }
        # Calculate means to be the new centroids, and the sum of differences
        $diff_means = 0;
        for( my $i = 0; $i < $k; ++$i ) {
            $centroids[$i] = sum( map {$vals[$_]} @{$groups[$i]} );
            $centroids[$i] /= scalar( @{$groups[$i]} );
            $diff_means += abs( $centroids[$i] - $prev_centroids[$i] );
        }
        # Save the current centroids for comparisons with those in the next iteration
        @prev_centroids = @centroids;
    }
    return @groups;
}

2020-11-06

perl v5.30.3