DOKK / manpages / debian 12 / libgenome-model-tools-music-perl / Genome::Model::Tools::Music::Bmr::CalcCovg.3pm.en
Genome::Model::Tools::Music::Bmr::CalcCovg(3pm) User Contributed Perl Documentation Genome::Model::Tools::Music::Bmr::CalcCovg(3pm)

  • Define cmd-list-file and cmd-prefix to generate a file with commands that can be submitted to a cluster or run manually. These jobs will write per-ROI base counts in a subdirectory roi_covgs.
  • After all the parallelized calcRoiCovg jobs are completed, run this script again to add them up and generate the final per-gene base counts in a subdirectory gene_covgs. Remember to remove the cmd-list-file and cmd-prefix arguments or you will just be re-creating a list of commands.

HELP }

# Extra sections for the generated help text.
# Returns a two-element list: the string "ARGUMENTS" followed by a string
# holding a single newline (presumably a section title and its body text --
# confirm against the help generator that consumes this).
sub _additional_help_sections {
  # Returned as a literal list, not an array, so that scalar-context callers
  # still receive the last element.
  return ( 'ARGUMENTS' => "\n" );
}

# Author credit used in the generated documentation.
# Returns the author string exactly as stored, including its leading space.
sub _doc_authors {
  my $authors = ' Cyriac Kandoth, Ph.D.';
  return $authors;
}

# "See also" text used in the generated documentation.
# Returns a newline-terminated string listing the related manual pages.
# The heredoc terminator must sit alone on its own line at column 0; with the
# body and terminator collapsed onto the `return` line the file cannot parse.
sub _doc_see_also {
  return <<EOS;
genome-music-bmr(1), genome-music(1), genome(1)
EOS
}

# Count covered bases per gene and per sample for every tumor-normal BAM pair.
#
# Reads its settings from accessors on $self: roi_file, reference_sequence,
# bam_list, output_dir, cmd_list_file, cmd_prefix, normal_min_depth,
# tumor_min_depth and min_mapq. Sets gene_covg_dir on $self.
#
# Two modes, selected by cmd_list_file:
#   * defined   - write one calc-covg-helper command per BAM pair to that file
#                 (prefixed with cmd_prefix when defined); nothing is computed.
#   * undefined - run CalcCovgHelper for each pair that has no non-empty
#                 roi_covgs/<sample>.covg yet, sum the per-ROI counts into
#                 gene_covgs/<sample>.covg, and add a row to total_covgs.
#
# Returns 1 on success. Returns undef (after reporting on STDERR) when an input
# is missing or the ROI file is not grouped by chromosome. Dies when a file it
# needs cannot be opened or an output directory cannot be created.
sub execute {
  my $self = shift;
  my $roi_file = $self->roi_file;
  my $ref_seq = $self->reference_sequence;
  my $bam_list = $self->bam_list;
  my $output_dir = $self->output_dir;
  my $cmd_list_file = $self->cmd_list_file;
  my $cmd_prefix = $self->cmd_prefix;
  my $normal_min_depth = $self->normal_min_depth;
  my $tumor_min_depth = $self->tumor_min_depth;
  my $min_mapq = $self->min_mapq;

  # Optional thresholds are forwarded only when they hold a true value.
  # NOTE(review): an explicit 0 is therefore never forwarded - confirm that is intended.
  my $optional_params = "";
  if( $normal_min_depth )
  {
    $optional_params .= " --normal-min-depth $normal_min_depth";
  }
  if( $tumor_min_depth )
  {
    $optional_params .= " --tumor-min-depth $tumor_min_depth";
  }
  if( $min_mapq )
  {
    $optional_params .= " --min-mapq $min_mapq";
  }

  # Check on all the input data before starting work
  print STDERR "ROI file not found or is empty: $roi_file\n" unless( -s $roi_file );
  print STDERR "Reference sequence file not found: $ref_seq\n" unless( -e $ref_seq );
  print STDERR "List of BAMs not found or is empty: $bam_list\n" unless( -s $bam_list );
  print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir );
  # "return undef" (not a bare return) is kept so list-context callers see the same value as before
  return undef unless( -s $roi_file && -e $ref_seq && -s $bam_list && -e $output_dir );

  # Outputs of this script will be written to these locations in the output directory
  $output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any
  my $roi_covg_dir = "$output_dir/roi_covgs"; # Stores output from calcRoiCovg per sample
  my $gene_covg_dir = "$output_dir/gene_covgs"; # Stores per-gene coverages per sample
  my $tot_covg_file = "$output_dir/total_covgs"; # Stores total coverages per sample
  $self->gene_covg_dir( $gene_covg_dir );

  # Check whether the annotated regions of interest are clumped together by chromosome
  my $roiFh = IO::File->new( $roi_file ) or die "ROI file could not be opened. $!\n";
  my @chroms = ( "" ); # Leading sentinel so that $chroms[-1] is always defined
  while( my $roi_line = $roiFh->getline ) # Emulate Unix's uniq command on the chromosome column
  {
    my ( $chrom ) = ( $roi_line =~ m/^(\S+)/ );
    # Blank lines carry no chromosome. Recording them would split a run of
    # identical chromosomes and make the clumping check fail on a valid file
    next unless( defined $chrom );
    push( @chroms, $chrom ) if( $chrom ne $chroms[-1] );
  }
  $roiFh->close;
  my %chroms = map { $_ => 1 } @chroms; # Get the actual number of unique chromosomes
  if( scalar( @chroms ) != scalar( keys %chroms ))
  {
    print STDERR "ROIs from the same chromosome must be listed adjacent to each other in file. ";
    print STDERR "If in UNIX, try:\nsort -k 1,1 $roi_file\n";
    return undef;
  }

  # If the reference sequence FASTA file hasn't been indexed, do it. The list form of
  # system() bypasses the shell, so paths with spaces or metacharacters are passed intact.
  # A failure is reported but, as before, does not stop the run.
  my $ref_seq_idx = "$ref_seq.fai";
  unless( -e $ref_seq_idx )
  {
    system( "samtools", "faidx", $ref_seq ) == 0
      or print STDERR "Warning: samtools faidx failed to index reference sequence: $ref_seq\n";
  }

  # Create the output directories unless they already exist
  foreach my $dir ( $roi_covg_dir, $gene_covg_dir )
  {
    next if( -e $dir );
    mkdir( $dir ) or die "Output directory could not be created: $dir. $!\n";
  }

  my ( $cmdFh, $totCovgFh );
  if( defined $cmd_list_file )
  {
    $cmdFh = IO::File->new( $cmd_list_file, ">" )
      or die "Command list file could not be opened for writing: $cmd_list_file. $!\n";
    print "Creating a list of parallelizable jobs at $cmd_list_file.\n";
    print "After successfully running all the jobs in $cmd_list_file,\n",
          "be sure to run this script a second time (without defining the cmd-list-file argument) to merge results in roi_covgs.\n";
  }
  else
  {
    $totCovgFh = IO::File->new( $tot_covg_file, ">" )
      or die "Total coverage file could not be opened for writing: $tot_covg_file. $!\n";
    $totCovgFh->print( "#Sample\tCovered_Bases\tAT_Bases_Covered\tCG_Bases_Covered\tCpG_Bases_Covered\n" );
  }

  # Parse through each pair of BAM files provided and run calcRoiCovg as necessary
  my $bamFh = IO::File->new( $bam_list ) or die "List of BAMs could not be opened: $bam_list. $!\n";
  while( my $line = $bamFh->getline )
  {
    next if( $line =~ m/^#/ );
    chomp( $line );
    next unless( $line =~ m/\S/ ); # Blank lines would otherwise be reported as a nameless sample
    my ( $sample, $normal_bam, $tumor_bam ) = split( /\t/, $line );
    $normal_bam = '' unless( defined $normal_bam );
    $tumor_bam = '' unless( defined $tumor_bam );
    print STDERR "Normal BAM for $sample not found: \"$normal_bam\"\n" unless( -e $normal_bam );
    print STDERR "Tumor BAM for $sample not found: \"$tumor_bam\"\n" unless( -e $tumor_bam );
    next unless( -e $normal_bam && -e $tumor_bam );

    my $sample_roi_covg_file = "$roi_covg_dir/$sample.covg";

    # Construct the command that calculates coverage per ROI
    my $calcRoiCovg_cmd = "\'gmt music bmr calc-covg-helper --normal-tumor-bam-pair \"$line\" --roi-file \"$roi_file\" ".
    "--reference-sequence \"$ref_seq\" --output-file \"$roi_covg_dir/$sample.covg\"$optional_params\'";

    # If user only wants the calcRoiCovg commands, write them to file and skip running calcRoiCovg
    if( defined $cmd_list_file )
    {
      $calcRoiCovg_cmd = $cmd_prefix . " $calcRoiCovg_cmd" if( defined $cmd_prefix );
      $cmdFh->print( "$calcRoiCovg_cmd\n" );
      next;
    }

    # If the calcRoiCovg output was already generated, then don't rerun it
    if( -s $sample_roi_covg_file )
    {
      print "$sample.covg found in $roi_covg_dir. Skipping re-calculation.\n";
    }
    # Run the calcRoiCovg command on this tumor-normal pair. This could take a while
    else
    {
      my %params = (
        normal_tumor_bam_pair => $line,
        roi_file => $roi_file,
        reference_sequence => $ref_seq,
        output_file => $sample_roi_covg_file,
      );
      $params{"normal_min_depth"} = $normal_min_depth if( $normal_min_depth );
      $params{"tumor_min_depth"} = $tumor_min_depth if( $tumor_min_depth );
      $params{"min_mapq"} = $min_mapq if( $min_mapq );

      my $cmd = Genome::Model::Tools::Music::Bmr::CalcCovgHelper->create( %params );
      # Guard against create() handing back nothing, which would otherwise die on ->execute
      my $rv = $cmd && $cmd->execute;
      if( !$rv )
      {
        print STDERR "Failed to execute: $calcRoiCovg_cmd\n";
        next;
      }
      print "$sample.covg generated and stored to $roi_covg_dir.\n";
    }

    # Read the calcRoiCovg output and count covered bases per gene
    my %geneCovg = ();
    my ( $tot_covd, $tot_at_covd, $tot_cg_covd, $tot_cpg_covd );
    my $roiCovgFh = IO::File->new( $sample_roi_covg_file )
      or die "ROI coverage file could not be opened: $sample_roi_covg_file. $!\n";
    while( my $covg_line = $roiCovgFh->getline )
    {
      chomp( $covg_line );
      next unless( $covg_line =~ m/\S/ );
      if( $covg_line =~ m/^#NonOverlappingTotals/ )
      {
        ( undef, undef, undef, $tot_covd, $tot_at_covd, $tot_cg_covd, $tot_cpg_covd ) = split( /\t/, $covg_line );
      }
      elsif( $covg_line !~ m/^#/ )
      {
        my ( $gene, undef, $length, $covd, $at_covd, $cg_covd, $cpg_covd ) = split( /\t/, $covg_line );
        $geneCovg{$gene}{len} += $length;
        $geneCovg{$gene}{covd_len} += $covd;
        $geneCovg{$gene}{at} += $at_covd;
        $geneCovg{$gene}{cg} += $cg_covd;
        $geneCovg{$gene}{cpg} += $cpg_covd;
      }
    }
    $roiCovgFh->close;

    # Without a full totals line the file is incomplete (for instance a job that was
    # interrupted), so report the sample instead of writing empty totals for it
    unless( defined $tot_cpg_covd )
    {
      print STDERR "No complete #NonOverlappingTotals line found in $sample_roi_covg_file. Skipping $sample.\n";
      next;
    }

    # Write the per-gene coverages to a file named after this sample_name
    my $sample_gene_covg_file = "$gene_covg_dir/$sample.covg";
    my $geneCovgFh = IO::File->new( $sample_gene_covg_file, ">" )
      or die "Gene coverage file could not be opened for writing: $sample_gene_covg_file. $!\n";
    $geneCovgFh->print( "#Gene\tLength\tCovered\tAT_covd\tCG_covd\tCpG_covd\n" );
    foreach my $gene ( sort keys %geneCovg )
    {
      $geneCovgFh->print( join( "\t", $gene, $geneCovg{$gene}{len}, $geneCovg{$gene}{covd_len},
         $geneCovg{$gene}{at}, $geneCovg{$gene}{cg}, $geneCovg{$gene}{cpg} ), "\n" );
    }
    $geneCovgFh->close;

    # Write total coverages for this sample to a file
    $totCovgFh->print( "$sample\t$tot_covd\t$tot_at_covd\t$tot_cg_covd\t$tot_cpg_covd\n" );
  }
  $bamFh->close;
  $cmdFh->close if( defined $cmd_list_file );
  $totCovgFh->close unless( defined $cmd_list_file );
  return 1;
}

1;

2020-11-06 perl v5.30.3