-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcheck_orf_to_contig.pl
executable file
·95 lines (63 loc) · 2.07 KB
/
check_orf_to_contig.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
use strict;
use warnings;
# Script body: compare an ORF-to-contig table against a reference table.
#
# Usage: check_orf_to_contig.pl <orfs_to_contigs_tsv> <correct_orfs_to_contigs>
#
# Both files are reduced by parse_orfs_to_contig() to a set of record keys.
# The run succeeds (prints "ALL OKAY!") only when both files have the same
# number of parsed records, the same number of distinct keys, and every key
# of the reference file also occurs in the file under test.  Any mismatch
# terminates the script via die.

# Fail early with a usage line instead of letting open() trip over an
# undefined file name.  Extra arguments are ignored, as before.
if (@ARGV < 2) {
    die "usage: $0 <orfs_to_contigs_tsv> <correct_orfs_to_contigs>\n";
}

my ($in_orfs_to_contigs_tsv, $in_correct_orfs_to_contigs) = @ARGV;

my %orfs_to_contigs;
my %orfs_to_contigs_as_should;

my $raw_num_orfs = parse_orfs_to_contig($in_orfs_to_contigs_tsv, \%orfs_to_contigs);
my $raw_num_orfs_as_should = parse_orfs_to_contig($in_correct_orfs_to_contigs, \%orfs_to_contigs_as_should);

# Record counts include duplicates, so this catches repeated lines that the
# hashes alone would hide.
if ($raw_num_orfs != $raw_num_orfs_as_should) {
    die "not the same number of non-unique records!";
}

if (scalar(keys %orfs_to_contigs) != scalar(keys %orfs_to_contigs_as_should)) {
    die "not the same number of unique records!";
}

# With equal key counts established above, checking that every reference key
# is present in the tested hash is enough to show that both key sets match.
compare_hash_to_hash(\%orfs_to_contigs_as_should, \%orfs_to_contigs);

print "ALL OKAY!\n";
# Verify that every key of the first hash is also a key of the second one.
#
# Parameters (positional):
#   1. hash ref whose keys must all be found
#   2. hash ref that is searched (the "target" hash)
#
# Dies as soon as a key missing from the target is encountered.  Hash values
# are never inspected, and keys that exist only in the target are not
# reported.
sub compare_hash_to_hash {
    my ($wanted_ref, $target_ref) = @_;

    # Work on shallow copies, so the caller's hashes are only ever read.
    my %wanted = %{$wanted_ref};
    my %target = %{$target_ref};

    for my $record (keys %wanted) {
        next if exists $target{$record};
        die "$record does not exist in target hash!\n";
    }
}
# Parse an ORF-to-contig table and record each distinct record in a hash.
#
# Two whitespace-separated line layouts are accepted:
#   0 1 1.000 0.000E+00 0 71 72 2 73 1494 0M
#   0 2+71 71 my_multi_exon_contig 1 1.000 0.000E+00 0 71 72 2 73 1494 0M
# A trailing "0M" field is dropped when present.  Of the remaining fields only
# the last six matter (in the examples: 0 71 72 2 73 1494); they are joined
# with tabs and used as the hash key.
#
# Parameters:
#   $in_orfs_to_contigs_file - path of the file to read
#   $orfs_to_conts_hash_ref  - hash ref that receives "key => 1" per record
#
# Returns the number of records parsed; duplicates are counted, blank and
# whitespace-only lines are not.
#
# Dies if the file cannot be opened, or if a record is left with fewer than
# six fields (taking "the last six" of such a line would silently wrap around
# the field list and build a wrong key).
sub parse_orfs_to_contig {
    my ($in_orfs_to_contigs_file, $orfs_to_conts_hash_ref) = @_;

    open(my $in, "<", $in_orfs_to_contigs_file)
        or die "could not open $in_orfs_to_contigs_file for reading: $!";

    my $num_records = 0;
    while (defined(my $line = <$in>)) {
        # Strip every null byte on the line, not only the first one.
        $line =~ tr/\x00//d;

        # split ' ' skips leading whitespace and also swallows the line
        # terminator ("\n" as well as "\r\n"), so no chomp is needed.
        my @line_parts = split(' ', $line);
        next if !@line_parts;    # blank or whitespace-only line

        pop @line_parts if $line_parts[-1] eq "0M";

        if (@line_parts < 6) {
            die "malformed record (fewer than 6 fields) in "
              . "$in_orfs_to_contigs_file, line $.\n";
        }

        # only these matter: 0 71 72 2 73 1494
        my $key = join("\t", @line_parts[-6 .. -1]);
        $orfs_to_conts_hash_ref->{$key} = 1;
        $num_records++;
    }
    close($in);

    return $num_records;
}