#!/usr/bin/perl use strict; # # Program to scan a FASTA-formatted sequence file for repeats of the same sequence # For use if isolated lots of clones and just one single copies. # # by Mohamed Noor, May 8, 2008. Modified May 12, 2008. # print "\n\nProgram to scan a FASTA-formatted sequence file for repeats of sequences.\n\n"; my @inp = @ARGV; # file name to scan should be in command line my $inp_file = $inp[0] or die "ERROR: Put input filename at command line.\n\n"; open (OUTR, ">$inp_file.rep"); system "cp $inp_file $inp_file.old"; # Copy original file to 'main' and use that as input open (MAIN, "<$inp_file.old"); open (OUTPUT, "> $inp_file"); my $line_read =
; my $header; # These are globally scoped. my $sequence; # These are globally scoped. my $unique_count = 1; my $repeat_count = 0; if ($line_read !~ /^>/) { print "File does not appear to be FASTA formatted.\n"; print "It does not start with a >\n"; } # end if print "Your original file will be replaced, but the original is backed up as \"$inp_file.old\".\n"; print "Output will also include \"$inp_file.rep\", which has all the sequence headers that matched\n"; print " existing sequences, and various temporary files that will be discarded.\n\n\n\n"; # # original setup with a single sequence # &get_next_seq; print OUTPUT $header; print OUTPUT $sequence."\n"; system "formatdb -i $inp_file -pF"; if (eof (MAIN)) { die "Only one sequence in the file...";} # # repeat scanning via BLAST, and if good, adding to database & making new one # until (eof (MAIN)) { &get_next_seq; open (TEMP, ">tempfile"); # make temporary file from which to BLAST this one sequence print TEMP $header; print TEMP $sequence."\n"; close TEMP; system "blastall -p blastn -d $inp_file -i tempfile -o outputfile -e 1e-5 -m8"; open (BLASTOUT, "outputfile"); if (eof(BLASTOUT)) { # check if BLASTed to nothing print OUTPUT $header; print OUTPUT $sequence."\n"; system "formatdb -i $inp_file -pF"; $unique_count++; } else { print OUTR $header; my $blast_look = ; if ($blast_look =~ /^(\S+)\s+(\S+)/) { # identify to what it matched from BLAST output print OUTR " Match to $2\n"; } else { print OUTR " Cannot identify to what it matched.\n"; } # end if/then/else blastlook $repeat_count++; } # end if (eof(BLASTOUT)) which was checking for BLAST hits } # end until (eof(MAIN)) system "rm tempfile"; # delete the garbage files this program made system "rm outputfile"; system "rm $inp_file.nhr"; # delete the garbage files BLASTALL made system "rm $inp_file.nin"; system "rm $inp_file.nsq"; print "$unique_count unique sequences identified.\n"; print "$repeat_count sequences that were discarded as repeats.\n\n"; close; sub get_next_seq { $header = $line_read; my $hit = 0; $sequence = ""; while ($hit==0) { $line_read =
; if (($line_read =~ /^>/) || (eof(MAIN))) { $hit = 1; } else { $sequence .= $line_read; chomp $sequence; $sequence =~ s/\s//g; # eliminate extra whitespace } # end if/ else } # end while ($hit==0) } # end sub get_next_seq Note, the sequence does NOT end with enter exit;