#!/usr/bin/perl if(@ARGV < 1) { print "Usage: $0 output_dir query_limit_flag intron_limit_flag\n"; exit; } use strict; use Class::Struct; # This module passes in the original sizes of exons and # introns of each refseq gene so that pseudogenes can be # classified as either processed or non-processed use refseq_exon_intron_sizes; struct Node => { start => '$', # Genomic start of hit end => '$', # Genomic end of hit qstart => '$', # Start of query qend => '$', # End of query strand => '$', # Direction of hit (1/-1) refseq => '$', # Refseq of hit chrom => '$', # Chrom of hit exon => '$', # Exon of hit queryLength => '$', # Original exon length Length => '$', # Length of sequence in hit minus gaps coverage => '$', # "Hit size / Query size" percentIdentity => '$', # Percent Identity of hit explored => '$', # A path was printed with this node (T/F) nextNode => '$' # Pointer to next node in path }; my $pgeneID=1; my $NON_OVERLAP=10; my $INTRON_LIMIT=19417; my $OUTPUT_DIR=$ARGV[0]; my $QUERY_LIMIT_FLAG=$ARGV[1]; my $INTRON_LIMIT_FLAG=$ARGV[2]; open SING_INTRONS, ">$OUTPUT_DIR/sing_introns.txt" || die "Cannot open $OUTPUT_DIR/sing_introns.txt for writing.\n"; open MULTI_INTRONS, ">$OUTPUT_DIR/multi_introns.txt" || die "Cannot open $OUTPUT_DIR/multi_introns.txt for writing.\n"; open PATHS, ">$OUTPUT_DIR/paths.txt" || die "Cannot open $OUTPUT_DIR/paths.txt for writing.\n"; open OPATHS, ">$OUTPUT_DIR/overlapping_paths.txt" || die "Cannot open $OUTPUT_DIR/overlapping_paths.txt for writing.\n"; # BLAST results are in the directory structure: # fasta/chr*/*/*.blasted # # Here collecting the list of refseqs and numbering # them (i.e. $blast_refseqs{NM_000000} = 0,1,2,3,...) # so that they are indexed in order. # Also counting the number of exons each refseq has foreach my $chr_dir (`ls fasta/chr* -d`) { chomp $chr_dir; my $source_chr = $chr_dir; $source_chr=~s/fasta\/chr(.*)/$1/; my %blast_refseqs = (); # Will hold the list of refseqs we have blast info for my %exon_count = (); my $refseq_count = 0; foreach my $dir (`ls $chr_dir`) { chomp $dir; foreach my $file (`ls $chr_dir/$dir/*.blasted`) { chomp $file; my $refseq = $file; $refseq=~s/$chr_dir\/$dir\/(.*)\.\d+\.blasted/$1/; if(!($blast_refseqs{$refseq})) { $refseq_count++; $blast_refseqs{$refseq} = $refseq_count; $exon_count{$refseq} = 1; } else { $exon_count{$refseq}++; } } } print PATHS "RefSeq Source Chromosome: $source_chr\n"; print PATHS "refseq count = $refseq_count\n"; print OPATHS "RefSeq Source Chromosome: $source_chr\n"; print OPATHS "refseq count = $refseq_count\n"; # Now sorting the list of refseqs we have blast info for my @sorted_refseqs = sort {$blast_refseqs{$a} <=> $blast_refseqs{$b}} keys %blast_refseqs; for(my $tmp=0; $tmp<@sorted_refseqs; $tmp++) { print PATHS "$sorted_refseqs[$tmp] has $exon_count{$sorted_refseqs[$tmp]} exons.\n"; print OPATHS "$sorted_refseqs[$tmp] has $exon_count{$sorted_refseqs[$tmp]} exons.\n"; my %HoN = (); # Hash of nodes foreach my $file (`ls $chr_dir/*/$sorted_refseqs[$tmp].*blasted`) { chomp $file; # Look at blast file hits and compare to structural data from UCSC open IN, "$file" || die "Cannot open $file for reading.\n"; my $exon = $file; my $refseq = $file; $exon=~s/.*\/NM_\d+\.(\d+)\.blasted/$1/; $refseq=~s/.*\/(NM_\d+)\.\d+\.blasted/$1/; my $hit_count=0; my $start; my $end; my $qstart; my $qend; my $strand; my $chr; my $length; my $percentIdentity; my $query_length; my $result_read=0; my $start_read=0; my $end_read=0; my $qstart_read=0; my $skip_next_record=0; while(my $line=) { chomp $line; if($line=~/letters\)/) { $query_length=$line; $query_length=~s/\s+\((\S+) letters\)/$1/; $query_length=~tr/,//d; } if(($line=~/>/) || ($line=~/Score/)) { if($result_read) { if(($length >= $query_length/2) || !($QUERY_LIMIT_FLAG)) { if($skip_next_record == 0) { if($query_length == 0) { print "$file\n"; } $HoN{$chr}{$exon}{$hit_count} = new Node; $HoN{$chr}{$exon}{$hit_count}->start($start); $HoN{$chr}{$exon}{$hit_count}->end($end); $HoN{$chr}{$exon}{$hit_count}->qstart($qstart); $HoN{$chr}{$exon}{$hit_count}->qend($qend); $HoN{$chr}{$exon}{$hit_count}->strand($strand); $HoN{$chr}{$exon}{$hit_count}->refseq($refseq); $HoN{$chr}{$exon}{$hit_count}->chrom($chr); $HoN{$chr}{$exon}{$hit_count}->exon($exon); $HoN{$chr}{$exon}{$hit_count}->queryLength($query_length); $HoN{$chr}{$exon}{$hit_count}->Length($length); $HoN{$chr}{$exon}{$hit_count}->coverage("$length / $query_length"); $HoN{$chr}{$exon}{$hit_count}->percentIdentity($percentIdentity); $HoN{$chr}{$exon}{$hit_count}->explored('F'); $HoN{$chr}{$exon}{$hit_count}->nextNode(0); $hit_count++; } elsif($skip_next_record == 1) { $skip_next_record = 0; } if($line=~/>/) { $skip_next_record=1; } } $start_read = 0; $end_read = 0; $qstart_read = 0; } if($line=~/>/) { $chr=$line; $chr=~s/>(.*)/$1/; } } elsif($line=~/Identities/) { $result_read=1; my $tmp=my $gaps=$line; $tmp=~s/ Identities = (\d+)\/\d+ .*/$1/; $length=$line; $length=~s/ Identities = \d+\/(\d+) .*/$1/; if($line=~/Gaps/) { $gaps=~s/ Identities = \d+\/\d+ .*Gaps \= (\d+)\/\d+ .*/$1/; $length -= $gaps; } $percentIdentity = $tmp / $length * 100; } elsif($line=~/Strand/) { if($line=~/Plus \/ Plus/) { $strand=1; } elsif($line=~/Plus \/ Minus/) { $strand =-1; } else { print "There was a problem reading the strand.\n"; } } elsif($line=~/Sbjct/) { if($strand == 1) { if(!($start_read)) { $start=$line; $start=~s/Sbjct: (\d+)\s+[\w\-]+\s+\d+/$1/; $start_read=1; } $end=$line; $end=~s/Sbjct: \d+\s+[\w\-]+\s+(\d+)/$1/; } else { if(!($end_read)) { $end=$line; $end=~s/Sbjct: (\d+)\s+[\w\-]+\s+\d+/$1/; $end_read=1; } $start=$line; $start=~s/Sbjct: \d+\s+[\w\-]+\s+(\d+)/$1/; } } elsif($line=~/Query:/) { if(!($qstart_read)) { $qstart=$line; $qstart=~s/Query: (\d+)\s+[\w\-]+\s+\d+/$1/; $qstart_read=1; } $qend=$line; $qend=~s/Query: \d+\s+[\w\-]+\s+(\d+)/$1/; } } #Storing Last Result if($result_read) { if(($length >= $query_length/2) || !($QUERY_LIMIT_FLAG)) { if($query_length == 0) { print "$file\n"; } $HoN{$chr}{$exon}{$hit_count} = new Node; $HoN{$chr}{$exon}{$hit_count}->start($start); $HoN{$chr}{$exon}{$hit_count}->end($end); $HoN{$chr}{$exon}{$hit_count}->qstart($qstart); $HoN{$chr}{$exon}{$hit_count}->qend($qend); $HoN{$chr}{$exon}{$hit_count}->strand($strand); $HoN{$chr}{$exon}{$hit_count}->refseq($refseq); $HoN{$chr}{$exon}{$hit_count}->chrom($chr); $HoN{$chr}{$exon}{$hit_count}->exon($exon); $HoN{$chr}{$exon}{$hit_count}->queryLength($query_length); $HoN{$chr}{$exon}{$hit_count}->Length($length); $HoN{$chr}{$exon}{$hit_count}->coverage("$length / $query_length"); $HoN{$chr}{$exon}{$hit_count}->percentIdentity($percentIdentity); $HoN{$chr}{$exon}{$hit_count}->explored('F'); $HoN{$chr}{$exon}{$hit_count}->nextNode(0); $hit_count++; } } close IN; } # This first set of nested loops will create the links between the nodes foreach my $i (sort keys %HoN) { # For each chromosome foreach my $j (sort {$a <=> $b} keys %{$HoN{$i}}) { # For each exon my $next_exon = $j+1; foreach my $k (sort {$a <=> $b} keys %{$HoN{$i}{$j}}) { # For each node my $smallest_50dist = 3000000000; # Upper bound on distance to a 50% hit (~3 billion bp) my $smallest_dist = 3000000000; # Upper bound on distance (~3 billion bp) my $closest_50hit = -1; my $closest_50exon; my $closest_hit = -1; my $closest_exon; my $node = new Node; $node = $HoN{$i}{$j}{$k}; foreach my $next_exon (sort {$a <=> $b} keys %{$HoN{$i}}) { # For each exon if($next_exon >= $j) { # If exon is greater than or equal to current exon foreach my $l (sort {$a <=> $b} keys %{$HoN{$i}{$next_exon}}) { # For each hit my $nextNode = new Node; $nextNode = $HoN{$i}{$next_exon}{$l}; if(($node->strand == $nextNode->strand) && ((($node->strand == 1) && ($node->start < $nextNode->start - $NON_OVERLAP) && ($node->end < $nextNode->end - $NON_OVERLAP)) || (($node->strand ==-1) && ($node->start > $nextNode->start + $NON_OVERLAP) && ($node->end > $nextNode->end + $NON_OVERLAP))) && (($next_exon > $j) || (($next_exon == $j) && ($l != $k) && ($node->qstart < $nextNode->qstart) && ($node->qend < $nextNode->qend)))) { my $dist=0; if($node->strand == 1) { $dist = $nextNode->start - $node->end; } else { $dist = $node->start - $nextNode->end; } my $orig_dist=0; if($next_exon == $j) { $orig_dist = 0; } elsif($next_exon == ($j+1)) { $orig_dist = $refseq_intron_sizes{$node->refseq.".".$node->exon}; } else { for(my $ecount=$node->exon; $ecount<($next_exon - 1); $ecount++) { $orig_dist += $refseq_intron_sizes{$node->refseq.".".$ecount} + $refseq_exon_sizes{$node->refseq.".".($ecount + 1)}; } $orig_dist += $refseq_intron_sizes{$node->refseq.".".($next_exon - 1)}; } if(($dist < $smallest_dist) || (($dist == $smallest_dist) && ($nextNode->Length > $HoN{$i}{$closest_exon}{$closest_hit}->Length))) { if(($dist <= $orig_dist + $INTRON_LIMIT) || !($INTRON_LIMIT_FLAG)) { $smallest_dist = $dist; $closest_hit = $l; $closest_exon = $next_exon; } } if(($nextNode->Length >= $nextNode->queryLength / 2) && (($dist < $smallest_50dist) || (($dist == $smallest_50dist) && ($nextNode->Length > $HoN{$i}{$closest_50exon}{$closest_50hit}->Length)))) { if(($dist <= $orig_dist + $INTRON_LIMIT) || !($INTRON_LIMIT_FLAG)) { $smallest_50dist = $dist; $closest_50hit = $l; $closest_50exon = $next_exon; } } } } } } my $cnode = new Node; my $c50node = new Node; $cnode = $HoN{$i}{$closest_exon}{$closest_hit}; $c50node = $HoN{$i}{$closest_50exon}{$closest_50hit}; if($closest_hit != -1) { if(($closest_50hit != -1) && (($closest_hit != $closest_50hit) || ($closest_exon != $closest_50exon))) { # Calculate distance between closest_hit and closest_50hit my $dist=0; if($node->strand == 1) { $dist = $c50node->start - $cnode->end; } else { $dist = $cnode->start - $c50node->end; } # If the closest hit is in the way, replace it with the closest 50% hit if(($cnode->exon > $c50node->exon) || (($node->strand == 1) && (($cnode->start >= $c50node->start - $NON_OVERLAP) || ($cnode->end >= $c50node->end - $NON_OVERLAP))) || (($node->strand ==-1) && (($cnode->start <= $c50node->start + $NON_OVERLAP) || ($cnode->end <= $c50node->end + $NON_OVERLAP))) || (($cnode->exon == $c50node->exon) && (($cnode->qstart >= $c50node->qstart) || ($cnode->qend >= $c50node->qend)))) { $cnode = $c50node; } } $node->nextNode($cnode); } } } } # This second set of nested loops will print out all the possible paths foreach my $i (sort keys %HoN) { # For each chromosome foreach my $j (sort {$a <=> $b} keys %{$HoN{$i}}) { # For each exon foreach my $k (sort {$a <=> $b} keys %{$HoN{$i}{$j}}) { # For each node my $node = new Node; $node = $HoN{$i}{$j}{$k}; # Is there another hit to this exon, that points to this one? # If so, don't start printing the path here, let the other # hit start the print out of this path. my $nodeLinkedTo=0; foreach my $l (sort {$a <=> $b} keys %{$HoN{$i}{$j}}) { if($HoN{$i}{$j}{$l}->nextNode == $node) { $nodeLinkedTo = 1; } } if(($node->explored eq 'F') && !($nodeLinkedTo)) { # Must search for fifty percent coverage in path since I am looking at all hits. # Also searching for overlapping pgenes to print them to a different file. my $fifty_percent_coverage_found = 0; my $overlapping_pgene = 0; my $multi_exon = 0; my $multi_intron = 0; my @qoverlaps = (); my @hoverlaps = (); my $last_exon = 0; while($node) { if($node->Length / $node->queryLength >= .5) { $fifty_percent_coverage_found=1; } if($last_exon && $last_exon != $node->exon) { if($multi_exon) { $multi_intron = 1; } $multi_exon = 1; } if($node->nextNode) { if(($node->exon eq $node->nextNode->exon) && ($node->qend - $node->nextNode->qstart > 0)) { push(@qoverlaps,$node->qend - $node->nextNode->qstart); $overlapping_pgene=1; } if(($node->strand == 1) && ($node->end - $node->nextNode->start > 0)) { push(@hoverlaps,$node->end - $node->nextNode->start); $overlapping_pgene=1; } if(($node->strand ==-1) && ($node->nextNode->end - $node->start > 0)) { push(@hoverlaps,$node->nextNode->end - $node->start); $overlapping_pgene=1; } } $last_exon = $node->exon; $node=$node->nextNode; } # Must reset $node after incrementing through paths. $node = $HoN{$i}{$j}{$k}; if($fifty_percent_coverage_found) { $node->explored('T'); my $path = ""; my $header = "ID:$pgeneID\t"; my $intron_count = 0; my $total_dist = 0; my $total_diff = 0; my $total_orig_dist = 0; my $total_base_coverage = $node->Length; my $total_similar_bases = $node->Length * $node->percentIdentity / 100; if($multi_exon) { $header .= "multi_exon\t"; } else { $header .= "single_exon\t"; } $path .= $node->chrom."\t".$node->exon."\t". $node->start."\t".$node->end."\t". $node->qstart."\t".$node->qend."\t". $node->coverage."\t".$node->strand."\t". $node->percentIdentity."\n"; # Only check if explored for the first node while($node->nextNode) { my $dist; if($node->strand == 1) { $dist = $node->nextNode->start - $node->end; } else { $dist = $node->start - $node->nextNode->end; } if($node->exon == $node->nextNode->exon) { # We don't want to print anything if both hits are to the same exon } elsif(($node->exon + 1) == $node->nextNode->exon) { my $orig_dist = $refseq_intron_sizes{$node->refseq.".".$node->exon}; my $difference = $dist - $orig_dist; my $diff_percent; if(($orig_dist == 0) && ($difference == 0)) { $diff_percent = 0; } elsif($orig_dist == 0) { $diff_percent = "Infinite"; } else { $diff_percent = $difference / $orig_dist; } if($multi_intron) { print MULTI_INTRONS "ID:$pgeneID\t".$node->refseq.".".$node->exon."\t". $node->nextNode->refseq.".".$node->nextNode->exon."\t". "$orig_dist\t$dist\t$difference\t$diff_percent\t*\n"; } else { print SING_INTRONS "ID:$pgeneID\t".$node->refseq.".".$node->exon."\t". $node->nextNode->refseq.".".$node->nextNode->exon."\t". "$orig_dist\t$dist\t$difference\t$diff_percent\t*\n"; } $intron_count++; if($dist > 0) { # If intron is less than or equal to zero we'll say it's zero and thus, don't need to add it $total_dist += $dist; $total_diff += $difference; } else { # But, we still need to note the difference if the intron's not there, orig_dist is completely gone $total_diff += -$orig_dist; } $total_orig_dist += $orig_dist; } else { my $orig_dist=0; for(my $ecount=$node->exon; $ecount<($node->nextNode->exon - 1); $ecount++) { $orig_dist += $refseq_intron_sizes{$node->refseq.".".$ecount} + $refseq_exon_sizes{$node->refseq.".".($ecount + 1)}; } $orig_dist += $refseq_intron_sizes{$node->refseq.".".($node->nextNode->exon - 1)}; my $difference = $dist - $orig_dist; my $diff_percent; if(($orig_dist == 0) && ($difference == 0)) { $diff_percent = 0; } elsif($orig_dist == 0) { $diff_percent = "Infinite"; } else { $diff_percent = $difference / $orig_dist; } if($multi_intron) { print MULTI_INTRONS "ID:$pgeneID\t".$node->refseq.".".$node->exon."\t". $node->nextNode->refseq.".".$node->nextNode->exon."\t". "$orig_dist\t$dist\t$difference\t$diff_percent\n"; } else { print SING_INTRONS "ID:$pgeneID\t".$node->refseq.".".$node->exon."\t". $node->nextNode->refseq.".".$node->nextNode->exon."\t". "$orig_dist\t$dist\t$difference\t$diff_percent\n"; } } $node = $node->nextNode; $node->explored('T'); $path .= $node->chrom."\t".$node->exon."\t". $node->start."\t".$node->end."\t". $node->qstart."\t".$node->qend."\t". $node->coverage."\t".$node->strand."\t". $node->percentIdentity."\n"; $total_base_coverage += $node->Length; $total_similar_bases += $node->Length * $node->percentIdentity / 100; } if($node->chrom eq $refseq_chrom{$node->refseq}) { # Check distance from last node my $dist1 = min4(abs($node->start - $refseq_txStart{$node->refseq}), abs($node->start - $refseq_txEnd{$node->refseq}), abs($node->end - $refseq_txStart{$node->refseq}), abs($node->end - $refseq_txEnd{$node->refseq})); # Then, reset to first node and check distance from there $node = $HoN{$i}{$j}{$k}; my $dist2 = min4(abs($node->start - $refseq_txStart{$node->refseq}), abs($node->start - $refseq_txEnd{$node->refseq}), abs($node->end - $refseq_txStart{$node->refseq}), abs($node->end - $refseq_txEnd{$node->refseq})); # Print the lesser of the two if($dist1 < $dist2) { $header .= $dist1; } else { $header .= $dist2; } $header .= "bp\t"; } else { $header .= "Infinite\t"; } $header .= "\t".round($total_similar_bases/$total_base_coverage*100)."%"; if($multi_exon) { if($total_orig_dist > 0) { # If, overall 90% or more of the introns have been removed or # the average intron size has reduced to 20bp or less, call it "Processed" # Otherwise, call it "Non-processed" my $percent_intron_change = round($total_diff/$total_orig_dist*100); my $average_pseudointron_size = round($total_dist/$intron_count); if(($percent_intron_change <= -90) || (($average_pseudointron_size <= 20) && ($percent_intron_change <= 0))) { $header .= "\tProcessed"; } else { $header .= "\tNon-processed"; } $header .= "\t$percent_intron_change%\t$average_pseudointron_size"."bp"; } } if($overlapping_pgene == 1) { print OPATHS "$header\n$path"; for(my $a=0; $a<@qoverlaps; $a++) { print OPATHS "Qoverlap: $qoverlaps[$a]\n"; } for(my $a=0; $a<@hoverlaps; $a++) { print OPATHS "Hoverlap: $hoverlaps[$a]\n"; } print OPATHS "\n"; } print PATHS "$header\n$path\n"; $pgeneID++; } else { while($node) { $node->explored('T'); $node=$node->nextNode; } } } } } } } } close SING_INTRONS; close MULTI_INTRONS; close PATHS; close OPATHS; sub min4 { my $a = shift; my $b = shift; my $c = shift; my $d = shift; if($a < $b) { if($a < $c) { if($a < $d) { return $a; } else { return $d; } } else { if($c < $d) { return $c; } else { return $d; } } } else { if($b < $c) { if($b < $d) { return $b; } else { return $d; } } else { if($c < $d) { return $c; } else { return $d; } } } } # This round function rounds to the nearest hundredth sub round { my $number = shift; $number *= 100; return int($number + .5 * ($number <=> 0))/100; }