#!/usr/bin/perl
use strict;
use warnings;

#
# Program to scan a FASTA-formatted 454 sequence file for usable microsatellites
# by Mohamed Noor, May 7, 2008 , modified August 7, 2008
#
# Interactive use: the program prompts for an input FASTA file, an output
# file, the minimum number of repeat units and the minimum number of flanking
# bases.  Every record whose sequence contains a qualifying di- or
# trinucleotide repeat is copied (header line + sequence joined onto a single
# line) to the output file, and summary counts are printed at the end.
#
# The code is organised as small subs with main() executed only when the file
# is run directly, so the scanning logic can be exercised on its own.
#

# Print a prompt and return the user's answer with surrounding whitespace
# (including the trailing newline) removed.  Dies if input ends prematurely.
sub prompt {
    my ($message) = @_;

    print $message;
    my $answer = <>;
    die "Unexpected end of input while reading answers.\n"
        unless defined $answer;

    $answer =~ s/\A\s+|\s+\z//g;
    return $answer;
}

# Check that $value is a whole number no smaller than $minimum and return it
# as a number.  $label names the setting in the error message.  The value is
# later interpolated into a regex quantifier, so anything else must be
# rejected here.
sub parse_count {
    my ($value, $label, $minimum) = @_;

    die "The $label must be a whole number of at least $minimum.\n"
        unless defined $value && $value =~ /\A\d+\z/ && $value >= $minimum;

    return $value + 0;
}

# Build one precompiled regex that recognises any qualifying microsatellite.
#
# Motifs are all 16 dinucleotides plus the 60 trinucleotides that are not a
# single repeated base.  For each motif the base immediately before the repeat
# run must be neither N nor the motif's last base, and the base immediately
# after must be neither N nor the motif's first base, so that the run is not
# simply a fragment of a longer run in a shifted reading frame.
#
# $repeats is the minimum number of motif copies; $flank is the minimum number
# of additional non-whitespace characters required on each side.  Because the
# match is unanchored, demanding exactly $flank characters is equivalent to
# demanding "at least $flank" and avoids needless backtracking.
sub build_microsat_regex {
    my ($repeats, $flank) = @_;

    my @bases = qw(A C G T);
    my @motifs;
    for my $first (@bases) {
        for my $second (@bases) {
            push @motifs, $first . $second;
            for my $third (@bases) {
                my $triplet = $first . $second . $third;
                next if $triplet =~ /\A(.)\1\1\z/;    # skip AAA, CCC, GGG, TTT
                push @motifs, $triplet;
            }
        }
    }

    my @alternatives = map {
        my $first_base = substr $_, 0, 1;
        my $last_base  = substr $_, -1;
        "[^${last_base}N](?:$_){$repeats,}[^${first_base}N]";
    } @motifs;
    my $core = join '|', @alternatives;

    return qr/\S{$flank}(?:$core)\S{$flank}/;
}

# Read FASTA records from filehandle $in and write every record containing a
# microsatellite to filehandle $out.
#
# The first line of the input is always treated as a header; a warning is
# printed if it does not start with ">".  Sequence lines are concatenated with
# all whitespace removed.  Matching is case-sensitive (upper-case bases).
#
# Returns a two-element list: (records scanned, records written).
sub scan_fasta {
    my ($in, $out, $repeats, $flank) = @_;

    my $microsat_re = build_microsat_regex($repeats, $flank);
    my $total_seq   = 0;
    my $microhits   = 0;

    my $header = <$in>;
    if (!defined $header || $header !~ /^>/) {
        print "File does not appear to be FASTA formatted.\n";
        print "It does not start with a >\n";
    }

    while (defined $header) {
        my $sequence = '';
        my $next_header;

        # Collect sequence lines up to the next header or end of file.  The
        # final line of the file is part of the last record and is kept.
        while (defined(my $line = <$in>)) {
            if ($line =~ /^>/) {
                $next_header = $line;
                last;
            }
            $line =~ s/\s//g;    # eliminate newlines and stray whitespace
            $sequence .= $line;
        }

        $total_seq++;    # one count per record, not per line read

        if ($sequence =~ $microsat_re) {
            # A header that is the very last line may lack its newline.
            $header .= "\n" unless $header =~ /\n\z/;
            print {$out} $header, $sequence, "\n";
            $microhits++;
        }

        $header = $next_header;
    }

    return ($total_seq, $microhits);
}

# Interactive entry point: gather settings, run the scan, report totals.
sub main {
    local $| = 1;    # make sure prompts appear before we wait for input

    print "\n\nProgram to scan a FASTA-formatted 454 sequence file for microsatellites.\n\n";
    print "Scans specifically for di- and trinucleotide repeats.\n\n\n";

    my $inp_file = prompt("Enter filename from which to scan for repeats: ");
    my $out_file = prompt("Enter filename to output: ");
    my $repeats  = parse_count(
        prompt("\nEnter minimum number of repeat units: "),
        'minimum number of repeat units', 1);
    my $flank = parse_count(
        prompt("Enter minimum number bases flanking sequence: "),
        'minimum number of flanking bases', 0);

    die "Input and output filenames must not be empty.\n"
        if $inp_file eq '' || $out_file eq '';
    die "Output file must differ from the input file.\n"
        if $inp_file eq $out_file;

    open my $seq_in, '<', $inp_file
        or die "Cannot open input file '$inp_file': $!\n";
    open my $seq_out, '>', $out_file
        or die "Cannot open output file '$out_file': $!\n";

    my ($total_seq, $microhits) = scan_fasta($seq_in, $seq_out, $repeats, $flank);

    close $seq_in;
    close $seq_out
        or die "Error writing output file '$out_file': $!\n";

    print "\nTotal number of scanned sequences: $total_seq\n";
    print "Total number of microsatellites identified: $microhits\n\n\n";

    return 0;
}

exit main() unless caller;

1;