Skip to content

Commit 2ae59c5

Browse files
author
Gaurav Sablok
committed
sciencegenome
finished writing the code and had no internet connection and hence pushing it now. Final and binary release.
1 parent e138e73 commit 2ae59c5

13 files changed

+315
-67
lines changed

README.md

+62-23
Original file line numberDiff line numberDiff line change
@@ -6,29 +6,68 @@
66
77
```
88
```
9-
➜ graph-clusters git:(main) ✗ ./target/debug/graph-kmers -h
10-
graphkmers
11-
12-
Usage: graph-kmers <COMMAND>
13-
14-
Commands:
15-
sequence profile the similarity based on the observed kmer
16-
help Print this message or the help of the given subcommand(s)
17-
18-
Options:
19-
-h, --help Print help
20-
-V, --version Print version
21-
➜ graph-clusters git:(main) ✗ ./target/debug/graph-kmers sequence -h
22-
profile the similarity based on the observed kmer
23-
24-
Usage: graph-kmers sequence <SEQUENCEPATH> <SEQUENCEKMER>
25-
26-
Arguments:
27-
<SEQUENCEPATH> provide the path to sequence file
28-
<SEQUENCEKMER> provide the kmer to be profiled for the sequence similarity
29-
30-
Options:
31-
-h, --help Print help
9+
12:33:48 gauravsablok@genome graph-kmer main ? ./target/debug/graph-kmers -h
10+
graphkmers
3211
12+
Usage: graph-kmers <COMMAND>
13+
14+
Commands:
15+
sequence identity kmer similarity index
16+
filter identity kmer filter
17+
help Print this message or the help of the given subcommand(s)
18+
19+
Options:
20+
-h, --help Print help
21+
-V, --version Print version
22+
12:33:53 gauravsablok@genome graph-kmer main ? ./target/debug/graph-kmers sequence -h
23+
identity kmer similarity index
24+
25+
Usage: graph-kmers sequence <SEQUENCEPATH> <SEQUENCEKMER>
26+
27+
Arguments:
28+
<SEQUENCEPATH> provide the path to sequence file
29+
<SEQUENCEKMER> provide the kmer to be profiled for the sequence similarity
30+
31+
Options:
32+
-h, --help Print help
33+
12:33:57 gauravsablok@genome graph-kmer main ? ./target/debug/graph-kmers filter -h
34+
identity kmer filter
35+
36+
Usage: graph-kmers filter <SEQUENCE> <KMER> <THRESHOLD>
37+
38+
Arguments:
39+
<SEQUENCE> provide the path to the sequence file
40+
<KMER> sequence kmer for the identity kmer
41+
<THRESHOLD> provide the threshold
42+
43+
Options:
44+
-h, --help Print help
45+
46+
```
47+
- to run the compiled library
48+
```
49+
12:53:04 gauravsablok@genome graph-kmer main ? \
50+
./target/debug/graph-kmers filter \
51+
./samplefile/sample.fasta 4 10
52+
53+
12:56:27 gauravsablok@genome samplefile main ?
54+
./target/debug/graph-kmers sequence
55+
./samplefile/sample.fasta 4
56+
57+
58+
12:55:34 gauravsablok@genome samplefile main ? head -n 4 frequencies-t*
59+
==> frequencies-tab.txt <== filtering without the threshold
60+
"chr11:66478458" "chr11:66478459" 22 44 50.0
61+
"chr11:66478458" "chr11:66478460" 12 44 27.272728
62+
"chr11:66478458" "chr11:66478461" 8 42 19.047619
63+
"chr11:66478458" "chr11:66478462" 9 41 21.95122
64+
65+
==> frequencies-threshold.txt <== filtering after the threshold
66+
"chr11:66478458" "chr11:66478459" 22 44 50.0
67+
"chr11:66478458" "chr11:66478460" 12 44 27.272728
68+
"chr11:66478458" "chr11:66478461" 8 42 19.047619
69+
"chr11:66478458" "chr11:66478462" 9 41 21.95122
3370
```
3471

72+
Gaurav Sablok
73+

graph-kmers

13.1 MB
Binary file not shown.

graph-kmers.tar

13.2 MB
Binary file not shown.

samplefile/frequencies-tab.txt

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"chr11:66478458" "chr11:66478459" 22 44 50.0
2+
"chr11:66478458" "chr11:66478460" 12 44 27.272728
3+
"chr11:66478458" "chr11:66478461" 8 42 19.047619
4+
"chr11:66478458" "chr11:66478462" 9 41 21.95122
5+
"chr11:66478459" "chr11:66478460" 12 44 27.272728
6+
"chr11:66478459" "chr11:66478461" 8 42 19.047619
7+
"chr11:66478459" "chr11:66478462" 9 41 21.95122
8+
"chr11:66478460" "chr11:66478461" 8 42 19.047619
9+
"chr11:66478460" "chr11:66478462" 9 41 21.95122
10+
"chr11:66478461" "chr11:66478462" 11 39 28.20513

samplefile/frequencies-threshold.txt

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"chr11:66478458" "chr11:66478459" 22 44 50.0
2+
"chr11:66478458" "chr11:66478460" 12 44 27.272728
3+
"chr11:66478458" "chr11:66478461" 8 42 19.047619
4+
"chr11:66478458" "chr11:66478462" 9 41 21.95122
5+
"chr11:66478459" "chr11:66478460" 12 44 27.272728
6+
"chr11:66478459" "chr11:66478461" 8 42 19.047619
7+
"chr11:66478459" "chr11:66478462" 9 41 21.95122
8+
"chr11:66478460" "chr11:66478461" 8 42 19.047619
9+
"chr11:66478460" "chr11:66478462" 9 41 21.95122
10+
"chr11:66478461" "chr11:66478462" 11 39 28.20513

samplefile/sample.fasta

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
>chr11:66478458
2+
ATGGCGGACACCCAGTACATCCTGC
3+
>chr11:66478459
4+
ATGGCGGACACCCAGTACATCCTGC
5+
>chr11:66478460
6+
ATGGCGGACACCCTATAGTACTAGA
7+
>chr11:66478461
8+
ATGGCGGACACTGACTGAATGCATA
9+
>chr11:66478462
10+
ATGGCGGACATAGATGCATGCATCA

samplefile/sequence-clusters.fasta

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"chr11:66478458" "chr11:66478459" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACACCCAGTACATCCTGC" 22 44 50.0
2+
"chr11:66478458" "chr11:66478460" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACACCCTATAGTACTAGA" 12 44 27.272728
3+
"chr11:66478458" "chr11:66478461" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACACTGACTGAATGCATA" 8 42 19.047619
4+
"chr11:66478458" "chr11:66478462" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACATAGATGCATGCATCA" 9 41 21.95122
5+
"chr11:66478459" "chr11:66478460" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACACCCTATAGTACTAGA" 12 44 27.272728
6+
"chr11:66478459" "chr11:66478461" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACACTGACTGAATGCATA" 8 42 19.047619
7+
"chr11:66478459" "chr11:66478462" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACATAGATGCATGCATCA" 9 41 21.95122
8+
"chr11:66478460" "chr11:66478461" "ATGGCGGACACCCTATAGTACTAGA" "ATGGCGGACACTGACTGAATGCATA" 8 42 19.047619
9+
"chr11:66478460" "chr11:66478462" "ATGGCGGACACCCTATAGTACTAGA" "ATGGCGGACATAGATGCATGCATCA" 9 41 21.95122
10+
"chr11:66478461" "chr11:66478462" "ATGGCGGACACTGACTGAATGCATA" "ATGGCGGACATAGATGCATGCATCA" 11 39 28.20513

samplefile/sequence-threshold.fasta

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"chr11:66478458" "chr11:66478459" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACACCCAGTACATCCTGC" 22 44 50.0
2+
"chr11:66478458" "chr11:66478460" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACACCCTATAGTACTAGA" 12 44 27.272728
3+
"chr11:66478458" "chr11:66478461" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACACTGACTGAATGCATA" 8 42 19.047619
4+
"chr11:66478458" "chr11:66478462" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACATAGATGCATGCATCA" 9 41 21.95122
5+
"chr11:66478459" "chr11:66478460" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACACCCTATAGTACTAGA" 12 44 27.272728
6+
"chr11:66478459" "chr11:66478461" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACACTGACTGAATGCATA" 8 42 19.047619
7+
"chr11:66478459" "chr11:66478462" "ATGGCGGACACCCAGTACATCCTGC" "ATGGCGGACATAGATGCATGCATCA" 9 41 21.95122
8+
"chr11:66478460" "chr11:66478461" "ATGGCGGACACCCTATAGTACTAGA" "ATGGCGGACACTGACTGAATGCATA" 8 42 19.047619
9+
"chr11:66478460" "chr11:66478462" "ATGGCGGACACCCTATAGTACTAGA" "ATGGCGGACATAGATGCATGCATCA" 9 41 21.95122
10+
"chr11:66478461" "chr11:66478462" "ATGGCGGACACTGACTGAATGCATA" "ATGGCGGACATAGATGCATGCATCA" 11 39 28.20513

src/args.rs

+10-1
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,20 @@ pub struct CommandParse {
99

1010
#[derive(Subcommand, Debug)]
1111
pub enum Commands {
12-
/// profile the similarity based on the observed kmer
12+
/// identity kmer similarity index
1313
Sequence {
1414
/// provide the path to sequence file
1515
sequencepath: String,
1616
/// provide the kmer to be profiled for the sequence similarity
1717
sequencekmer: String,
1818
},
19+
/// identity kmer filter
20+
Filter {
21+
/// provide the path to the sequence file
22+
sequence: String,
23+
/// sequence kmer for the identity kmer
24+
kmer: String,
25+
/// provide the threshold
26+
threshold: String,
27+
},
1928
}

src/filestruct.rs

+3-10
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,11 @@ pub struct Genomeiter {
77

88
#[derive(Debug, Clone, PartialOrd, PartialEq)]
99

10-
pub struct Collector {
10+
pub struct CollectIter {
1111
pub name: String,
12+
pub namenext: String,
1213
pub id: String,
13-
pub count: usize,
14-
}
15-
16-
#[derive(Debug, Clone, PartialOrd, PartialEq)]
17-
18-
pub struct ProfileKmer {
19-
pub name: String,
20-
pub sequence: Vec<String>,
14+
pub idnext: String,
2115
pub count: usize,
2216
pub shared: usize,
23-
pub ratio: usize,
2417
}

src/main.rs

+10-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
mod args;
22
mod filestruct;
3+
mod simfilter;
34
mod similarity;
45
use crate::args::CommandParse;
56
use crate::args::Commands;
7+
use crate::simfilter::simfilterarg;
68
use crate::similarity::profilesimilarity;
7-
89
use clap::Parser;
910

1011
/*
@@ -26,5 +27,13 @@ fn main() {
2627
command
2728
);
2829
}
30+
Commands::Filter {
31+
sequence,
32+
kmer,
33+
threshold,
34+
} => {
35+
let command = simfilterarg(sequence, kmer, threshold).unwrap();
36+
println!("The filtered files have been written: {:?}", command);
37+
}
2938
}
3039
}

src/simfilter.rs

+134
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
use crate::filestruct::CollectIter;
2+
use crate::filestruct::Genomeiter;
3+
use std::collections::HashSet;
4+
use std::error::Error;
5+
use std::fs::File;
6+
use std::io::{BufRead, BufReader, Write};
7+
/*
8+
9+
Author Gaurav Sablok
10+
SLB Potsdam
11+
Date: 2025-2-7
12+
13+
profiling the kmer on the sequences and then based on the
14+
sequence kmers implementing a similarity ratio and that says
15+
16+
kmerratio = observer unique kmer /
17+
number of the total unique kmer * 100
18+
*/
19+
20+
pub fn simfilterarg(path: &str, kmer: &str, threshold: &str) -> Result<String, Box<dyn Error>> {
21+
let fileopen = File::open(path).expect("file not found");
22+
let fileread = BufReader::new(fileopen);
23+
let mut sequencevector: Vec<String> = Vec::new();
24+
let mut headervector: Vec<String> = Vec::new();
25+
let thresh: f32 = threshold.parse::<f32>().unwrap();
26+
for i in fileread.lines() {
27+
let line = i.expect("file not found");
28+
if line.starts_with(">") {
29+
headervector.push(line.replace(">", ""));
30+
} else if !line.starts_with(">") {
31+
sequencevector.push(line);
32+
}
33+
}
34+
let mut combinedinfo: Vec<Genomeiter> = Vec::new();
35+
for i in 0..headervector.len() {
36+
combinedinfo.push(Genomeiter {
37+
header: headervector[i].clone(),
38+
sequence: sequencevector[i].clone(),
39+
})
40+
}
41+
42+
let mut seqbtreemap: Vec<(String, (String, Vec<String>))> = Vec::new();
43+
for i in combinedinfo.iter() {
44+
let windowkmer: Vec<_> = i
45+
.sequence
46+
.chars()
47+
.map(String::from)
48+
.collect::<Vec<_>>()
49+
.windows(kmer.parse::<usize>().unwrap())
50+
.map(|x| x.join("").to_string())
51+
.collect::<Vec<_>>();
52+
let sequencehash: Vec<String> = windowkmer
53+
.into_iter()
54+
.collect::<HashSet<String>>()
55+
.into_iter()
56+
.collect::<Vec<_>>();
57+
seqbtreemap.push((i.header.clone(), (i.sequence.clone(), sequencehash)));
58+
}
59+
60+
let mut newbase: Vec<Vec<CollectIter>> = Vec::new();
61+
for i in 0..seqbtreemap.len() - 1 {
62+
let mut shared: Vec<CollectIter> = Vec::new();
63+
let vec = seqbtreemap[i].clone();
64+
let restvec: Vec<_> = seqbtreemap[i + 1usize..seqbtreemap.len()]
65+
.iter()
66+
.collect::<Vec<_>>();
67+
for restvectiter in 0..restvec.len() {
68+
let mut countkmer: usize = 0usize;
69+
for itercount in 0..vec.1 .1.len() {
70+
if restvec[restvectiter]
71+
.1
72+
.1
73+
.contains(&vec.1 .1[itercount].to_string())
74+
{
75+
countkmer += 1usize;
76+
}
77+
}
78+
shared.push(CollectIter {
79+
name: vec.0.clone(),
80+
namenext: restvec[restvectiter].0.clone(),
81+
id: vec.1 .0.clone(),
82+
idnext: restvec[restvectiter].1 .0.clone(),
83+
count: countkmer,
84+
shared: vec.1 .1.len() + restvec[restvectiter].1 .1.len(),
85+
});
86+
}
87+
newbase.push(shared);
88+
}
89+
90+
let mut filewrite = File::create("sequence-threshold.fasta").expect("file not found");
91+
for i in newbase.iter() {
92+
for j in i.iter() {
93+
let calibrationpoint: f32 = j.count as f32 / j.shared as f32 * 100.0;
94+
if calibrationpoint >= thresh {
95+
writeln!(
96+
filewrite,
97+
"{:?}\t{:?}\t{:?}\t{:?}\t{:?}\t{:?}\t{:?}",
98+
j.name,
99+
j.namenext,
100+
j.id,
101+
j.idnext,
102+
j.count,
103+
j.shared,
104+
j.count as f32 / j.shared as f32 * 100.0
105+
)
106+
.expect("file not found");
107+
}
108+
}
109+
}
110+
111+
let mut filesecondwrite = File::create("frequencies-threshold.txt").expect("file not present");
112+
for i in newbase.iter() {
113+
for j in i.iter() {
114+
let calibrationpoint: f32 = j.count as f32 / j.shared as f32 * 100.0;
115+
if calibrationpoint >= thresh {
116+
writeln!(
117+
filesecondwrite,
118+
"{:?}\t{:?}\t{:?}\t{:?}\t{:?}",
119+
j.name,
120+
j.namenext,
121+
j.count,
122+
j.shared,
123+
j.count as f32 / j.shared as f32 * 100.0
124+
)
125+
.expect("file not found");
126+
}
127+
}
128+
}
129+
130+
Ok(
131+
"The sequence similarity scores and the cluster of the sequences have been written"
132+
.to_string(),
133+
)
134+
}

0 commit comments

Comments
 (0)