This is a simple Perl script that extracts FASTA sequences from a large FASTA file, selecting the records whose headers match those listed in another file.
For example, your FASTA sequences are in a file named "input.fa" and the headers are in another file called "headers.txt".
#!/usr/bin/perl
# Extract selected records from a FASTA file.
#
# Reads whitespace-separated sequence IDs from $headerfile, scans $input for
# FASTA records whose ID (the first whitespace-delimited token after '>')
# is in that list, and writes the matching records to $output in the order
# the IDs appear in $headerfile. IDs that are not found are reported on
# STDERR. Dies with a descriptive message if any file cannot be opened or
# the output cannot be flushed.
use strict;
use warnings;

my $headerfile = 'headers.txt';
my $input      = 'input.fa';
my $output     = 'output.fa';    # must differ from $input, which is still being read

# Collect the requested IDs. A leading '>' is tolerated so that headers
# pasted straight from a FASTA file also match.
open( my $header_fh, '<', $headerfile )
    or die "Cannot open '$headerfile' for reading: $!\n";
my @headers;
while ( my $line = <$header_fh> ) {
    foreach my $id ( split ' ', $line ) {    # split ' ' also drops newline/CR
        $id =~ s/\A>//;
        push @headers, $id if length $id;
    }
}
close $header_fh;

# Lookup table so only requested records are kept in memory.
my %wanted = map { $_ => 1 } @headers;

# Parse the FASTA file line by line. Records are delimited by lines that
# start with '>', not by blank lines, so paragraph mode is not suitable.
my %sequences;    # ID => full record text (header line + sequence lines)
open( my $input_fh, '<', $input )
    or die "Cannot open '$input' for reading: $!\n";
my $current;      # ID of the wanted record being read, undef while skipping
while ( my $line = <$input_fh> ) {
    $line =~ s/\r?\n\z//;    # handle both Unix and DOS line endings
    if ( $line =~ /\A>/ ) {
        my ($id) = $line =~ /\A>\s*(\S+)/;
        $current = ( defined $id && $wanted{$id} ) ? $id : undef;

        # A repeated ID replaces the earlier record (last one wins).
        $sequences{$current} = "$line\n" if defined $current;
    }
    elsif ( defined $current && $line =~ /\S/ ) {
        $sequences{$current} .= "$line\n";
    }
}
close $input_fh;

# Write the matching records in the order they were requested.
open( my $seqsfile, '>', $output )
    or die "Cannot open '$output' for writing: $!\n";
foreach my $header (@headers) {
    if ( exists $sequences{$header} ) {
        print {$seqsfile} $sequences{$header};
    }
    else {
        warn "Header '$header' not found in '$input'\n";
    }
}
close($seqsfile) or die "Cannot close '$output': $!\n";
exit;
