-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_IMG_info.pl
executable file
·141 lines (121 loc) · 4.14 KB
/
load_IMG_info.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/perl
use strict;
use lib $ENV{SCRIPTS};
use ENV;
use Getopt::Std;
use DBI;
my %arg;
&getopts('i:D:p:h', \%arg);
my $source = "IMG";
my $rank = 9;
if ($arg{'h'}) {
print STDERR "load_IMG_info.pl -i [inputfile] -D [ENVdatabase] -p [dbpswd]
This program loads particular information from a tab-delimited version of the IMG .info file
into the feature_annotations and feature_evidence tables in an ENV-schema database.
Datatypes loaded: Product_name, EC, COG, KO
The program does NOT delete or update any existing data in the database. It only inserts new rows.
It does, however, avoid inserting duplicate KO accessions.
Accessions that are missing or are duplicate in the feature_accessions table are reported in
separate logfiles (load_IMG_info.[pid].missing and load_IMG_info.[pid].duplicate
";
exit;
}
my $file = $arg{'i'};
my $db = $arg{'D'};
my $pswd = $arg{'p'};
my $host = $ENV{DBSERVER} ? $ENV{DBSERVER} : 'localhost';
my $dbh = DBI->connect("dbi:mysql:host=$host;db=$db", $ENV{USER}, $pswd);
open(my $in, $file) or die "Can't open $file for read: $!\n";
# Start by reading data into a structure
my %DATA;
while (my $line = <$in>) {
chomp $line;
my ($oid,
$locus,
$source,
$cluster_info,
$gene_info,
$evalue) = split/\t/, $line;
if ($oid eq "gene_oid") { next }
if ($source eq "Product_name") {
$DATA{$locus}->{'product'} = $gene_info;
} elsif ($source =~ /^EC\:(\S+)/) {
push @{$DATA{$locus}->{'ec'}}, $1;
} elsif ($source =~ /^(COG\d{4})/) {
push @{$DATA{$locus}->{'COG'}}, [$1, $evalue];
} elsif ($source =~ /^KO\:(K\d{5})/) {
push @{$DATA{$locus}->{'KO'}}, [$1, $evalue];
} elsif ($source eq "Protein_length") {
$gene_info =~ s/aa//;
$DATA{$locus}->{'protlen'} = $gene_info;
}
}
# Test for missing data
my $fidq = "SELECT feature_id FROM feature_accessions WHERE accession=?";
my $fsth = $dbh->prepare($fidq);
my $missing; my $mc;
my $duplicate; my $dc;
foreach my $acc (keys %DATA) {
$fsth->execute($acc);
my $rows = $fsth->fetchall_arrayref;
if (scalar @$rows == 0) {
$missing .= "$acc\n";
$mc++;
} elsif (scalar @$rows > 1) {
$duplicate .= "$acc";
foreach my $idr (@$rows) {
$duplicate .= "\t$idr->[0]";
}
$duplicate .= "\n";
$dc++;
} else {
$DATA{$acc}->{'feature_id'} = $rows->[0]->[0];
}
}
if ($missing) {
open my $out, ">load_IMG_info.$$.missing";
print $out $missing;
close $out;
warn "$mc accessions were missing from the db - see load_IMG_info.$$.missing for details.\n";
}
if ($duplicate) {
open my $out, ">load_IMG_info.$$.duplicate";
print $out $duplicate;
close $out;
warn "$dc accessions are assigned to more than one feature_id in the db - see load_IMG_info.$$.duplicate for details.\n";
}
# Now load the info
my $anni = "INSERT INTO feature_annotations"
. " (feature_id, data_type_id, value, rank, source, edit_by)"
. " VALUES (?, ?, ?, '$rank', '$source', USER())";
my $annsth = $dbh->prepare($anni);
my $evi = "INSERT INTO feature_evidence"
. " (feature_id, feat_min, feat_max, program, ev_type, ev_accession, ev_min, ev_max, ev_length, score)"
. " VALUES (?, ?, ?, 'IMG_pipeline', ?, ?, ?, ?, ?, ?)";
my $evsth = $dbh->prepare($evi);
foreach my $acc (keys %DATA) {
my $fid = $DATA{$acc}->{'feature_id'};
if (!$fid) { next }
$annsth->execute($fid, 66, $DATA{$acc}->{'product'});
$annsth->execute($fid, 1, join(" ", @{$DATA{$acc}->{'ec'}})) if (defined $DATA{$acc}->{'ec'});
if (defined $DATA{$acc}->{'COG'}) {
foreach my $row (@{$DATA{$acc}->{'COG'}}) {
my ($evacc, $evalue) = @$row;
$evsth->execute($fid, 1, $DATA{$acc}->{'protlen'}, 'COG', $evacc, 0, 0, 0, $evalue);
}}
if (defined $DATA{$acc}->{'KO'}) {
foreach my $row (@{$DATA{$acc}->{'KO'}}) {
my ($evacc, $evalue) = @$row;
if (&check_ko($dbh, $fid, $evacc)) {
$evsth->execute($fid, 1, $DATA{$acc}->{'protlen'}, 'KO', $evacc, 0, 0, 0, $evalue);
}
}}
}
sub check_ko {
my $dbh = shift;
my $fid = shift;
my $koacc = shift;
my $q = "SELECT count(*) from feature_evidence where feature_id=$fid AND ev_accession=\"$koacc\"";
my @r = $dbh->selectrow_array($q);
if ($r[0] > 0) { return 0 } else { return 1 }
}