Skip to content

Commit

Permalink
Extend the -O, --overlap option
Browse files Browse the repository at this point in the history
to choose the denominator with respect to the SRC or TGT.
The extension is backward compatible.
  • Loading branch information
pd3 authored and whitwham committed Jul 30, 2024
1 parent 278e23d commit f1a7ec9
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 17 deletions.
8 changes: 5 additions & 3 deletions annot-tsv.1
Original file line number Diff line number Diff line change
Expand Up @@ -170,13 +170,15 @@ Ignore the headers completely and use numeric indexes even when a header exists
Suppress index numbers in the printed header. If given twice, drop the entire header.
.RE
.PP
.BR \-O ", " \-\-overlap " FLOAT"
.BR \-O ", " \-\-overlap " FLOAT,[FLOAT]"
.RS 4
Minimum overlap as a fraction of region length in at least one of the overlapping regions. If also
Minimum overlap as a fraction of region length in SRC and TGT, respectively (with two numbers), or in
at least one of the overlapping regions (with a single number). If also
.BR \-r ", " \-\-reciprocal
is given, require at least
.I FLOAT
overlap with respect to both regions
overlap with respect to both regions. Two identical numbers are equivalent to running with
.BR \-r ", " \-\-reciprocal
.RE
.PP
.BR \-r ", " \-\-reciprocal
Expand Down
50 changes: 36 additions & 14 deletions annot-tsv.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ typedef struct
char *core_str, *match_str, *transfer_str, *annots_str, *headers_str, *delim_str;
char *temp_dir, *out_fname;
BGZF *out_fp;
int allow_dups, reciprocal, max_annots, mode, no_write_hdr;
double overlap;
int allow_dups, max_annots, mode, no_write_hdr, overlap_either;
double overlap_src, overlap_dst;
regidx_t *idx;
regitr_t *itr;
kstring_t tmp_kstr;
Expand Down Expand Up @@ -736,18 +736,20 @@ void process_line(args_t *args, char *line, size_t size)
int has_match = 0, annot_len = 0;
while ( regitr_overlap(args->itr) )
{
if ( args->overlap )
if ( args->overlap_src || args->overlap_dst )
{
double len1 = end - beg + 1;
double len2 = args->itr->end - args->itr->beg + 1;
double len_dst = end - beg + 1;
double len_src = args->itr->end - args->itr->beg + 1;
double isec = (args->itr->end < end ? args->itr->end : end) - (args->itr->beg > beg ? args->itr->beg : beg) + 1;
if ( args->reciprocal )
int pass_dst = isec/len_dst < args->overlap_dst ? 0 : 1;
int pass_src = isec/len_src < args->overlap_src ? 0 : 1;
if ( args->overlap_either )
{
if ( isec/len1 < args->overlap || isec/len2 < args->overlap ) continue;
if ( !pass_dst && !pass_src ) continue;
}
else
{
if ( isec/len1 < args->overlap && isec/len2 < args->overlap ) continue;
if ( !pass_dst || !pass_src ) continue;
}
}
cols_t *src_cols = regitr_payload(args->itr,cols_t*);
Expand Down Expand Up @@ -885,8 +887,9 @@ static const char *usage_text(void)
" -H, --ignore-headers Use numeric indices, ignore the headers completely\n"
" -I, --no-header-idx Suppress index numbers in the printed header. If given\n"
" twice, drop the entire header\n"
" -O, --overlap FLOAT Minimum required overlap (non-reciprocal, unless -r\n"
" is given)\n"
" -O, --overlap FLOAT[,FLOAT] Minimum required overlap with respect to SRC,TGT.\n"
" If single value, the bigger overlap is considered.\n"
" Identical values are equivalent to running with -r.\n"
" -r, --reciprocal Apply the -O requirement to both overlapping\n"
" intervals\n"
" -x, --drop-overlaps Drop overlapping regions (precludes -f)\n"
Expand Down Expand Up @@ -941,6 +944,7 @@ int main(int argc, char **argv)
};
char *tmp = NULL;
int c;
int reciprocal = 0;
while ((c = getopt_long(argc, argv, "c:f:m:o:s:t:a:HO:rxh:Id:",loptions,NULL)) >= 0)
{
switch (c)
Expand All @@ -960,16 +964,24 @@ int main(int argc, char **argv)
case 'd': args->delim_str = optarg; break;
case 'h': args->headers_str = optarg; break;
case 'H': args->headers_str = "0:0"; break;
case 'r': args->reciprocal = 1; break;
case 'r': reciprocal = 1; break;
case 'c': args->core_str = optarg; break;
case 't': args->dst.fname = optarg; break;
case 'm': args->match_str = optarg; break;
case 'a': args->annots_str = optarg; break;
case 'o': args->out_fname = optarg; break;
case 'O':
args->overlap = strtod(optarg, &tmp);
if ( tmp==optarg || *tmp ) error("Could not parse --overlap %s\n", optarg);
if ( args->overlap<0 || args->overlap>1 ) error("Expected value from the interval [0,1]: --overlap %s\n", optarg);
args->overlap_src = strtod(optarg, &tmp);
if ( tmp==optarg || (*tmp && *tmp!=',') ) error("Could not parse --overlap %s\n", optarg);
if ( args->overlap_src<0 || args->overlap_src>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg);
if ( *tmp )
{
args->overlap_dst = strtod(tmp+1, &tmp);
if ( *tmp ) error("Could not parse --overlap %s\n", optarg);
if ( args->overlap_dst<0 || args->overlap_dst>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg);
}
else
args->overlap_either = 1;
break;
case 's': args->src.fname = optarg; break;
case 'f': args->transfer_str = optarg; break;
Expand All @@ -994,6 +1006,16 @@ int main(int argc, char **argv)
else args->mode = PRINT_MATCHING|PRINT_NONMATCHING;
}
if ( (args->transfer_str || args->annots_str) && !(args->mode & PRINT_MATCHING) ) error("The option -x cannot be combined with -f and -a\n");
if ( reciprocal )
{
if ( args->overlap_dst && args->overlap_src && args->overlap_dst!=args->overlap_src )
error("The combination of --reciprocal with --overlap %f,%f makes no sense: expected single value or identical values\n",args->overlap_src,args->overlap_dst);
if ( !args->overlap_src )
args->overlap_src = args->overlap_dst;
else
args->overlap_dst = args->overlap_src;
args->overlap_either = 0;
}

init_data(args);
write_header(args, &args->dst);
Expand Down
2 changes: 2 additions & 0 deletions test/annot-tsv/out.13.1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 10 20 long long,short
1 15 15 short long,short
2 changes: 2 additions & 0 deletions test/annot-tsv/out.13.2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 10 20 long long
1 15 15 short short
2 changes: 2 additions & 0 deletions test/annot-tsv/out.13.3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 10 20 long long
1 15 15 short long,short
2 changes: 2 additions & 0 deletions test/annot-tsv/out.13.4.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 10 20 long long,short
1 15 15 short short
2 changes: 2 additions & 0 deletions test/annot-tsv/src.13.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 10 20 long
1 15 15 short
6 changes: 6 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -1472,4 +1472,10 @@ sub test_annot_tsv
run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.3.txt',args=>'-c chr2,beg2,end2:chr,beg,end -f smpl2:src_smpl -h 3:2 -I');
run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.12.txt',out=>'out.12.1.txt',args=>'-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d ,');
run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>q[-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d $',:\t']);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5]);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5]);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5 -r]);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5,0.5]);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.3.txt',args=>q[-c 1,2,3 -f 4:5 -O 0,1]);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.4.txt',args=>q[-c 1,2,3 -f 4:5 -O 1,0]);
}

0 comments on commit f1a7ec9

Please sign in to comment.