#!/usr/bin/env perl #-------------------------------------------------------------------------------------------------------# # Made by Gabriel Abud # # Creation of this script was funded in part by the Ohio Research Internship Project (ORIP). # # It may be freely distributed under GNU General Public License. # # This script takes a blast output file and a contig or genome file as arguments (in that order). # # It then parses through the blast file to check for good matches # # Those matches are then used back into the genome file, and the sequence in the genome # # is displayed (to STDOUT) # # To redirect the output to a file, use the Unix '>' operator # # Note: If you want to see the output in a Unix shell, please be sure to maximize the terminal # # before hand. # # # # Changes: # # Optional flags for maximum e-value, amount of hits shown, and promoter region were added # # Use --help for more information # #-------------------------------------------------------------------------------------------------------# use warnings; # Modules use lib 'C:\\Perl\\lib'; # Depending on where you're perl libraries are located use lib 'C:\\Perl\\lib\\Bio'; # "" use Bio::SearchIO; use Bio::Seq; # use File::Basename; # Variables my $line; my @scaffolds; my $inputFile; my $scaffoldFind; my $baseList; my @start; my @end; my @strand; my @arrayBases; my $query_name; my @query_names; my $baseFile; my $e_value; my $baseProg; my $matches; my $total_size; my $title; $baseProg = $0; sub syntax { print STDERR "ERROR: Invalid arguments\n"; print STDERR "See '$baseProg --help' for more information\n"; exit; } # Help screen if( defined($ARGV[0]) && "$ARGV[0]" =~ /^-?-?help$/i ) { print "\n$baseProg:\n\n\n"; print "Syntax:\n\t$baseProg blast_output_file contig_or_genome_file [OPTIONS]\n\n"; print "Options:\n"; print "\t-e, maximum e-value for matches (0.01 by default)\n\n"; print "\t-p, base pairs of promoter region to be included (should only be used in DNA sequences)\n\n"; print "\t-n, number of top hits to display, starting with the highest hit (1 by default)\n\n"; exit } # Checks for correct amount of arguments if( @ARGV < 2 ) { print STDERR "USAGE: $baseProg blast_output_file contig_or_genome_file [OPTIONS]\n"; print STDERR "Type '$baseProg --help' for more information\n"; exit; } # Defaults $e_value = 0.01; # Default e-value (if none specified) $matches = 1; # Default # of matches printed # $promoter is undefined by default # Checks for options my $n = 0; for( @ARGV ) { if( $n eq "p" ) { if( !($_ =~ /^[0-9]+$/) ) { &syntax; } $promoter = $_; $n = 0; } elsif( $n eq "e" ) { if( !($_ =~ /^[0-9]*.?[0-9]+$/) ) { &syntax; } $e_value = $_; $n = 0; } elsif( $n eq "n" ) { if( !($_ =~ /^[0-9]+$/) ) { &syntax; } $matches = $_; $n = 0; } elsif( $_ eq "-p" ) { $n = "p"; } elsif( $_ eq "-e" ) { $n = "e"; } elsif( $_ eq "-n" ) { $n = "n"; } } ( $blastFile, $inputFile ) = @ARGV; # Sets the first two arguments to the $blastFile and $inputFile variables # Class used to search through the blast file my $in = Bio::SearchIO->new(-format => 'blast', -file => $blastFile); # Creates arrays of all the scaffold names and the coordinates of where those scaffolds are found in the contig or genome files $n = 0; while ( my $result = $in->next_result) { $query_name = $result->query_description; # Gets the query description (there should only be one) LOOP: while( my $hit = $result->next_hit ) { while ( $hsp = $hit->next_hsp ) { if ( $hsp->evalue <= $e_value ) { # Finds all matches with an evalue <= $e_value (or 0.01 by default) push(@query_names, $query_name); ( $start[$n], $end[$n] ) = $hsp->range('hit'); $scaffolds[$n] = $hit->name, $strand[$n] = $hsp->strand('hit'); $n += 1; } if( $n >= $matches ) { # Exits after the amount of matches have been found (1 by default) last LOOP; } } } } $baseFile = $inputFile; my $m; for $m ( 0..$#scaffolds ) { # Runs a loop as many times as there are scaffolds # Handle file input open INFILE, "<$inputFile" or die "\nSorry, can't open your input file: $inputFile\n$!"; $baseList = ""; $scaffoldFind = 0; $n = 0; while ($line = ) { # Returns true if the scaffold selected is found in vaiable $line if (($scaffoldFind == 0) && ($line =~ /$scaffolds[$m]/)) { $scaffoldFind++; } # Enter below if the line does not have the character > and the correct scaffold was already found elsif (($line !~ />/) && ($scaffoldFind == 1)) { chomp($line); $baseList .= "$line"; } # Exit after reaching a blank line or a line starting with > elsif ( (($line =~ /^\s*$/) || ($line =~ />/)) && ($scaffoldFind == 1)) { last; # Exit out of loop } } print ">$scaffolds[$m] ", pop(@query_names), " ($baseFile) at $start[$m] - $end[$m]"; # Print title line for each scaffold $seq_obj = Bio::Seq->new(-seq => "$baseList"); if( defined($promoter) && $strand[$m] == 1 ) { # If a promoter region was specified as an argument AND the strand is +/+ print " Promoter = $promoter"; if( $promoter >= $start[$m] ) { # If promoter is bigger than the contig, just start at the start of the contig print " !!Promoter is too big for seq!"; print STDERR "ERROR: Promoter is too big (Max promoter = ", $start[$m] - 1 , " )\n"; print STDERR "Showing sequence without promoter region...\n"; $baseList = $seq_obj->subseq($start[$m], $end[$m]); } else { $baseList = $seq_obj->subseq(($start[$m] - $promoter), $end[$m]); } } elsif( $strand[$m] == 1 && !defined($promoter) ) { $baseList = $seq_obj->subseq($start[$m], $end[$m]); } $rev_obj = Bio::Seq->new(-seq => "$baseList"); # Checks to see if the hit sequence is the reverse compliment # If it is, it changes it so that it matches the query sequence if( $strand[$m] == -1 && defined($promoter) ) { # If a promoter region was specified AND the strand is +/- print " Promoter = $promoter"; print " (showing the reverse compliment)"; # Lets you know that this is the reverse compliment of the real sequence $total_size = length($seq_obj->seq); if( ( $promoter + $end[$m] ) > $total_size ) { # If promoter is bigger than the contig, print an error and don't display promoter print " !!Promoter is too big for seq!"; print STDERR "ERROR: Promoter is too big (Max promoter = ", $total_size - $end[$m] , " )\n"; print STDERR "Showing sequence without promoter region...\n"; $baseList = $seq_obj->subseq($start[$m], $end[$m]); } else { $baseList = $seq_obj->subseq($start[$m], ( $end[$m] + $promoter )); } print "\n"; $rev_obj = Bio::Seq->new(-seq => "$baseList"); $rev_obj = $rev_obj->revcom; } elsif ( $strand[$m] == -1 && !defined($promoter) ) { print " (showing the reverse compliment)\n"; $baseList = $seq_obj->subseq($start[$m], $end[$m]); $rev_obj = Bio::Seq->new(-seq => "$baseList"); $rev_obj = $rev_obj->revcom; } else { print "\n"; } $baseList = $rev_obj->seq; # Assigns the new sequence to $baseList # Prints out the bases in the scaffold @arrayBases = split ( //, $baseList); for( @arrayBases ) { print $_; if( ($n + 1) % 70 == 0 ) { # Adds a new line after every 70 bases for more readable and managable output print "\n"; } $n++; } print "\n\n"; close INFILE; } # End of major for loop