Skip to content

Commit

Permalink
Handling spurious breakpoints
Browse files Browse the repository at this point in the history
  • Loading branch information
dubssieg committed Jan 9, 2025
1 parent 465f9a6 commit dd78205
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 16 deletions.
46 changes: 31 additions & 15 deletions src/compute_distance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ pub fn distance(
path_positions2: HashMap<String, u64>,
path_lengths1: HashMap<String, u64>,
path_lengths2: HashMap<String, u64>,
mut spurious_breakpoints1: Vec<String>,
mut spurious_breakpoints2: Vec<String>,
) -> io::Result<()> {
/*
Given two GFA files and their associated node sizes and path positions, this function computes the distance between the two graphs.
Expand All @@ -37,15 +39,16 @@ pub fn distance(
println!("## {}\t{}", path_name, path_lengths1[path_name.as_str()]);
}

let mut equivalences_count = 0;
let mut merges_count = 0;
let mut splits_count = 0;
let mut equivalences_count: i32 = 0;
let mut merges_count: i32 = 0;
let mut splits_count: i32 = 0;
let mut spurious_count: i32 = 0;

println!("# Path name\tPosition\tOperation\tNodeA\tNodeB\tBreakpointA\tBreakpointB");
for path_name in intersection {
// We get the positions of the path in the two files
let pos1 = path_positions1[path_name];
let pos2 = path_positions2[path_name];
let pos1: u64 = path_positions1[path_name];
let pos2: u64 = path_positions2[path_name];

// We open the two files
let mut file1: BufReader<File> = BufReader::new(File::open(file_path1)?);
Expand Down Expand Up @@ -95,21 +98,33 @@ pub fn distance(
// It is a split operation
splits_count += 1;
// The two positions in the two paths are not aligned
println!(
"{}\t{}\tS\t{}\t{}\t{}\t{}",
path_name, position, node1, node2, breakpoint_a, breakpoint_b
);
if spurious_breakpoints2.contains(&node2) {
// Remove the spurious breakpoint from the vector
spurious_breakpoints2.retain(|x| x != &node2);
spurious_count += 1;
} else {
println!(
"{}\t{}\tS\t{}\t{}\t{}\t{}",
path_name, position, node1, node2, breakpoint_a, breakpoint_b
);
}
node1 = read_next_node(&mut file1, &mut buffer1);
breakpoint_a += node_sizes1.get(&node1).unwrap().clone();
} else if breakpoint_a > breakpoint_b {
// The node in the second path is missing in the first path
// It is a merge operation
merges_count += 1;
// The two positions in the two paths are not aligned
println!(
"{}\t{}\tM\t{}\t{}\t{}\t{}",
path_name, position, node1, node2, breakpoint_a, breakpoint_b
);
if spurious_breakpoints1.contains(&node1) {
// Remove the spurious breakpoint from the vector
spurious_breakpoints1.retain(|x| x != &node1);
spurious_count += 1;
} else {
println!(
"{}\t{}\tM\t{}\t{}\t{}\t{}",
path_name, position, node1, node2, breakpoint_a, breakpoint_b
);
}
node2 = read_next_node(&mut file2, &mut buffer2);
breakpoint_b += node_sizes2.get(&node2).unwrap().clone();
}
Expand All @@ -120,11 +135,12 @@ pub fn distance(
}
}
println!(
"# Distance: {} (E={}, S={}, M={}).",
"# Distance: {} (E={}, S={}, M={}, SP={}).",
splits_count + merges_count,
equivalences_count,
splits_count,
merges_count
merges_count,
spurious_count
);
Ok(())
}
Expand Down
48 changes: 48 additions & 0 deletions src/evaluate_spuriousness.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufRead, BufReader};
// We want to check if some positions are spurious breakpoints.
// A spurious breakpoint is a position where the node before has a single outgoing edge and the node after has a single incoming edge.
// We can check this by computing the number of preceding and following neigbors of each node.
// Here, we will list nodes ID per path that has spuriousness issues
// We can then iterate in the next step over the path and check if nodes identified as spurious breakpoints are in the path.
// If they are in, we can get the position in the path, which will gives us a series of spurious breakpoints per path

pub fn spurious_breakpoints(file_path: &str) -> io::Result<Vec<String>> {
/*
Given a file path, this function reads the GFA file and returns a HashMap:
- spurious_nodes: a vector of spurious node IDs as values
*/
let file: File = File::open(file_path)?;
let mut reader: BufReader<File> = BufReader::new(file);
let mut seq_successors: HashMap<String, Vec<String>> = HashMap::new();

let mut line: String = String::new();
while reader.read_line(&mut line)? > 0 {
let columns: Vec<&str> = line.split('\t').collect();
if let Some(first_char) = line.chars().next() {
if first_char == 'E' {
// In the case of an E-line, we store the predecessor and successor nodes
let successor: String = String::from(columns[3]);

if seq_successors.contains_key(&successor) {
seq_successors
.get_mut(&successor)
.unwrap()
.push(successor.clone());
} else {
seq_successors.insert(successor.clone(), vec![successor.clone()]);
}
}
line.clear(); // Clear the line buffer for the next read
}
}
// We then search for nodes that have only one predecessor and we return them
let mut spurious_nodes: Vec<String> = Vec::new();
for (node, successors) in seq_successors.iter() {
if successors.len() == 1 {
spurious_nodes.push(node.clone());
}
}
Ok(spurious_nodes)
}
24 changes: 23 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
mod compute_distance;
mod evaluate_spuriousness;
mod index_gfa_file;

use clap::Parser;

#[derive(Parser, Debug)]
#[command(
version = "v0.1.2",
version = "v0.1.3",
about = "GFA graph comparison tool",
long_about = "Compares pangenome graphs by calculating the segmentation distance between two GFA (Graphical Fragment Assembly) files."
)]
Expand All @@ -14,6 +15,9 @@ struct Cli {
file_path_a: String,
/// The path to the second GFA file
file_path_b: String,
/// Checks for spurious breakpoints in graphs
#[clap(long = "spurious", short = 's', action)]
spurious: bool,
}

fn main() {
Expand Down Expand Up @@ -58,6 +62,22 @@ fn main() {
std::process::exit(1);
}

let spurious_nodes_a: Vec<String>;
let spurious_nodes_b: Vec<String>;

if args.spurious {
// Check for spurious breakpoints in the first graph
spurious_nodes_a = evaluate_spuriousness::spurious_breakpoints(&args.file_path_a).unwrap();

// Check for spurious breakpoints in the second graph
spurious_nodes_b = evaluate_spuriousness::spurious_breakpoints(&args.file_path_b).unwrap();
} else {
// If the spurious option is not given, do not check for spurious breakpoints
// Init empty vectors
spurious_nodes_a = Vec::new();
spurious_nodes_b = Vec::new();
}

// Compute the distance between the two graphs
compute_distance::distance(
&args.file_path_a,
Expand All @@ -68,6 +88,8 @@ fn main() {
path_descriptors_b,
path_lengths_a,
path_lengths_b,
spurious_nodes_a,
spurious_nodes_b,
)
.unwrap();
}

0 comments on commit dd78205

Please sign in to comment.