diff --git a/.gitignore b/.gitignore index ce108c5..cadeea2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ /venv /tmp /run_1k +/benchmark traces_debug.csv pseudoscans_debug.json diff --git a/README.md b/README.md index 1ac93b7..2f545c8 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# Ionmesh +# IonMesh ## General functioning @@ -49,6 +49,17 @@ ims_scaling = 0.02 min_n = 4 min_neighbor_intensity = 500 +[sage_search_config] +static_mods = [[ + "C", + 57.0215, +]] +variable_mods = [[ + "M", + [15.9949], +]] +fasta_path = "./tmp/UP000005640_9606.fasta" + [output_config] # These options can be missing, if missing will not output the files. out_features_csv = "features.csv" debug_traces_csv = "debug_traces.csv" @@ -70,12 +81,23 @@ DEBUG_TRACES_FROM_CACHE=1 # If set and non empty will load the traces from the c ## Roadmap 1. Use aggregation metrics to re-score sage search. -2. Do a two pass speudospec generation, where the first pass finds the centroids and the second pass aggregates around a radius. (this will prevent the issue where common ions, like b2's are assigned only to the most intense spectrum in a window....) +2. [In progress] Do a two-pass pseudospec generation, where the first pass finds the centroids and the second pass aggregates around a radius. (this will prevent the issue where common ions, like b2's, are assigned only to the most intense spectrum in a window...) - Right now I believe it is over-aggregating peaks and leading to a lot of straggler peaks. 3. Re-define rt parameters in the config as a function of the cycle time and not raw seconds. + - This can help with the observed fact that parameters perform very differently across instrument/method variants. (Some possible reparametrizations: % of frame intensity instead of absolute intensity... number of cycles instead of retention time... + making more active use of the intensity of the cluster rather than the number of neighbors...) 4. Add targeted extraction. 5. Add detection of MS1 features + notched search instead of wide window search. 6. Clean up some of the features and decide which aggregation steps use internal parallelism. (in some steps making multiple aggregations in parallel is better than doing parallel operations within the aggregation). + - Fix nomenclature ... I don't like how inconsistent it is (indexed, indexer, and index are used interchangeably ...). +7. Compilation warning cleanup. +8. Clean up dead/commented-out code. +9. Refactor the `max_extension_distances` argument in the generic dbscan implementation to prevent the errors that might arise from mixing up the dimensions. + - Should that be a property of the converter? +10. Commit to f32/f64 in specific places ... instead of the harder-to-maintain generic types. +11. Add CICD to distribute the pre-compiled binaries. +12. Add semver checks to the CICD pipeline. +13. Add IMS output to the sage report. ## Maybe in the roadmap @@ -85,8 +107,4 @@ DEBUG_TRACES_FROM_CACHE=1 # If set and non empty will load the traces from the c ## Where are we at? -- Ids are not great ... They do seem good via manual inspection but the number of ids is low. - - - - +- IDs are pretty close to the equivalent DDA runs with the correct parameters ... They do seem good via manual inspection, but the number of IDs is low compared to peptide-centric searches.
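Alongside the new `[sage_search_config]` block above, this change set moves the denoise/tracing/pseudoscan configs into their own modules and gives each one per-dimension expansion limits. A hedged sketch of how the new keys would look in the example TOML — the key names follow the serde field names of the new `DenoiseConfig` and `TracingConfig` structs and the values are their Rust `Default`s, so treat this as illustrative rather than a tuned config:

```toml
# Illustrative only: keys mirror the serde field names introduced in this PR;
# values are taken from the `Default` impls of DenoiseConfig and TracingConfig.
[denoise_config]
mz_scaling = 0.015
ims_scaling = 0.015
max_mz_expansion_ratio = 1.0   # hard cap on cluster growth along m/z
max_ims_expansion_ratio = 4.0  # hard cap on cluster growth along ion mobility
ms2_min_n = 5
ms1_min_n = 10
ms1_min_cluster_intensity = 100
ms2_min_cluster_intensity = 100

[tracing_config]
mz_scaling = 0.015
rt_scaling = 2.4
ims_scaling = 0.02
max_mz_expansion_ratio = 1.0
max_rt_expansion_ratio = 1.5
max_ims_expansion_ratio = 4.0
min_n = 3
min_neighbor_intensity = 450
```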
diff --git a/src/aggregation/chromatograms.rs b/src/aggregation/chromatograms.rs index 05d56ff..ecd21bd 100644 --- a/src/aggregation/chromatograms.rs +++ b/src/aggregation/chromatograms.rs @@ -1,7 +1,7 @@ use log::warn; use num_traits::AsPrimitive; -use rerun::external::arrow2::io::print; + use std::collections::BTreeMap; use std::ops::{AddAssign, Mul}; @@ -24,9 +24,9 @@ pub struct ChromatogramArray + AddAssign + Default + AsPrimi impl BTreeChromatogram { /// Create a new BTreeChromatogram with the given RT binsize and bin offset. - /// + /// /// The values in bin = 0 will be in the range [bin_offset, bin_offset + binsize) - /// + /// pub fn new(rt_binsize: f32, rt_bin_offset: f32) -> Self { BTreeChromatogram { btree: BTreeMap::new(), @@ -81,7 +81,7 @@ impl BTreeChromatogram { match min { Some(min) => { let max = *self.btree.keys().last().unwrap(); - Some((*min as i32, max as i32)) + Some((*min, max)) } None => None, } @@ -150,10 +150,10 @@ impl BTreeChromatogram { // The chromatogram uses the bin size of the chromatogram btree // but re-centers it to the mean RT of the trace - if self.btree.len() > 0 { + if !self.btree.is_empty() { let int_center = ((center_rt.unwrap_or(0.) - self.rt_bin_offset.unwrap()) / self.rt_binsize) as i32; let left_start = int_center - (NUM_LOCAL_CHROMATOGRAM_BINS / 2) as i32; - + for i in 0..NUM_LOCAL_CHROMATOGRAM_BINS { let bin = left_start + i as i32; chromatogram_arr[i] = *self.btree.get(&bin).unwrap_or(&0) as f32; @@ -223,7 +223,7 @@ mod chromatogram_tests { assert_eq!(c.get_bin(&0), Some(&1)); assert!(c.get_bin(&-1).is_none()); let neg_one_rt = c.bin_to_rt(-1); - assert!(neg_one_rt >= -1.01 && neg_one_rt <= -0.99); + assert!((-1.01..=-0.99).contains(&neg_one_rt)); let mut c2 = BTreeChromatogram::new(1., 0.); c2.add(0., 1); @@ -277,7 +277,7 @@ mod chromatogram_tests { c.chromatogram[4] = 20; let cosine = c.cosine_similarity(&c2).unwrap(); assert!(cosine <= 0.9, "Cosine: {}", cosine); - + } #[test] diff --git a/src/aggregation/dbscan.rs b/src/aggregation/dbscan.rs index 5009d07..feab118 100644 --- a/src/aggregation/dbscan.rs +++ b/src/aggregation/dbscan.rs @@ -17,7 +17,7 @@ use crate::mod_types::Float; use crate::ms::frames; use crate::space::space_generics::{HasIntensity, IndexedPoints, NDPoint}; use indicatif::ProgressIterator; -use log::{debug, info}; +use log::{debug, info, trace}; use rayon::prelude::*; @@ -168,6 +168,7 @@ fn _dbscan< filter_fun: Option, converter: C, progress: bool, + max_extension_distances: &[Float;N], ) -> (u64, Vec>) { let mut initial_candidates_counts = utils::RollingSDCalculator::default(); let mut final_candidates_counts = utils::RollingSDCalculator::default(); @@ -254,7 +255,6 @@ fn _dbscan< let mut seed_set: Vec<&usize> = Vec::new(); seed_set.extend(neighbors); - const MAX_EXTENSION_DISTANCE: Float = 5.; let mut internal_neighbor_additions = 0; while let Some(neighbor) = seed_set.pop() { @@ -304,9 +304,15 @@ fn _dbscan< let p = &quad_points[**i]; let query_point = query_elems.1.unwrap(); - // Using minkowski distance with p = 1, manhattan distance.
- let dist = (p.values[0] - query_point.values[0]).abs() - + (p.values[1] - query_point.values[1]).abs(); - let within_distance = dist <= MAX_EXTENSION_DISTANCE; + // Check each dimension against its own maximum extension distance (a box constraint). + let mut within_distance = true; + for ((v, q), max_dist) in p.values.iter().zip(query_point.values).zip(max_extension_distances.iter()) { + let dist = (v - q).abs(); + within_distance = within_distance && dist <= *max_dist; + if !within_distance { + break; + } + } + going_downhill && within_distance }); local_neighbor_filter_timer.stop(false); @@ -435,8 +441,8 @@ pub fn aggregate_clusters< >( tot_clusters: u64, cluster_labels: Vec>, - elements: Vec, - def_aggregator: F, + elements: &[T], + def_aggregator: &F, log_level: utils::LogLevel, keep_unclustered: bool, ) -> Vec { @@ -552,11 +558,68 @@ pub fn aggregate_clusters< out } +// Fairly simple function: it converts every passed centroid to a point and +// generates a new centroid that aggregates all the points within its range. +// In contrast with the dbscan method, the resulting clusters are not necessarily +// mutually exclusive (one element can contribute to several centroids). +fn reassign_centroid< + 'a, + const N: usize, + T: HasIntensity + Send + Clone + Copy, + C: NDPointConverter, + I: IndexedPoints<'a, N, usize> + std::marker::Sync, + G: Sync + Send + ClusterAggregator, + R: Send, + F: Fn() -> G + Send + Sync, + Z: AsPrimitive + + Send + + Sync + + Add + + PartialOrd + + Div + + Mul + + Default + + Sub, +>( + centroids: Vec, + indexed_points: &'a I, + centroid_converter: C, + elements: &[T], + def_aggregator: F, + log_level: utils::LogLevel, + expansion_factors: &[Float;N], +) -> Vec { + let mut timer = utils::ContextTimer::new("reassign_centroid", true, log_level); + let mut out = Vec::with_capacity(centroids.len()); + + for centroid in centroids { + let query_point = centroid_converter.convert(&centroid); + let mut query_elems = centroid_converter.convert_to_bounds_query(&query_point); + query_elems.0.expand(expansion_factors); + + // trace!("Querying for Centroid: {:?}", query_elems.1); + // trace!("Querying for Boundary: {:?}", query_elems.0); + let neighbors = indexed_points.query_ndrange(&query_elems.0, query_elems.1); + // trace!("Found {} neighbors", neighbors.len()); + let mut aggregator = def_aggregator(); + let mut num_agg = 0; + for neighbor in neighbors { + aggregator.add(&elements[*neighbor]); + num_agg += 1; + } + trace!("Aggregated {} elements", num_agg); + out.push(aggregator.aggregate()); + } + + timer.stop(true); + out +} // TODO: rename prefiltered peaks argument! // TODO implement a version that takes a sparse distance matrix.
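The replacement above swaps the old scalar Manhattan-distance cutoff for a per-dimension check: a candidate is kept only if it is within the maximum extension distance on every axis, i.e. a box (weighted Chebyshev) constraint rather than an L1 ball. A minimal, self-contained sketch of the same predicate using iterator combinators — the names here are illustrative, not the codebase's:

```rust
// Sketch of the per-dimension ("box") acceptance test used during cluster
// extension: every axis must independently stay within its own cap.
fn within_extension<const N: usize>(p: &[f32; N], q: &[f32; N], max_dist: &[f32; N]) -> bool {
    p.iter()
        .zip(q.iter())
        .zip(max_dist.iter())
        .all(|((a, b), m)| (a - b).abs() <= *m)
}

fn main() {
    // 0.01 m/z and 2.0 scans away from the query, with caps of 0.02 and 5.0 -> accepted.
    assert!(within_extension(&[100.01, 3.0], &[100.0, 1.0], &[0.02, 5.0]));
    // Exceeds the m/z cap on the first axis -> rejected, regardless of the other axis.
    assert!(!within_extension(&[100.05, 3.0], &[100.0, 1.0], &[0.02, 5.0]));
}
```

This is also why the old single `MAX_EXTENSION_DISTANCE` constant could be deleted: summing axes with very different units (m/z, RT seconds, ion mobility) in one L1 distance made the cutoff hard to reason about, while per-axis caps keep each unit separate.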
pub fn dbscan_generic< C: NDPointConverter, + C2: NDPointConverter, R: Send, G: Sync + Send + ClusterAggregator, T: HasIntensity + Send + Clone + Copy + Sync, @@ -582,6 +645,8 @@ pub fn dbscan_generic< extra_filter_fun: Option<&FF>, log_level: Option, keep_unclustered: bool, + max_extension_distances: &[Float;N], + back_converter: Option<C2>, ) -> Vec { let show_progress = log_level.is_some(); let log_level = match log_level { @@ -625,17 +690,46 @@ pub fn dbscan_generic< extra_filter_fun, converter, show_progress, + max_extension_distances, ); i_timer.stop(true); - aggregate_clusters( + let centroids = aggregate_clusters( tot_clusters, cluster_labels, - prefiltered_peaks, - def_aggregator, + &prefiltered_peaks, + &def_aggregator, log_level, keep_unclustered, - ) + ); + + match back_converter { + Some(bc) => { + reassign_centroid( + centroids, + &tree, + bc, + &prefiltered_peaks, + &def_aggregator, + log_level, + max_extension_distances, + ) + } + None => { + centroids + } + } +} + +// https://github.com/rust-lang/rust/issues/35121 +// The never type is not stable yet.... +struct BypassDenseFrameBackConverter {} + +impl NDPointConverter for BypassDenseFrameBackConverter { + fn convert(&self, _elem: &frames::TimsPeak) -> NDPoint<2> { + panic!("This should never be called") + } } struct DenseFrameConverter { @@ -658,8 +752,10 @@ type FFTimsPeak = fn(&TimsPeak, &TimsPeak) -> bool; // bool> pub fn dbscan_denseframes( mut denseframe: frames::DenseFrame, - mz_scaling: &f64, - ims_scaling: &f32, + mz_scaling: f64, + max_mz_extension: f64, + ims_scaling: f32, + max_ims_extension: f32, min_n: usize, min_intensity: u64, ) -> frames::DenseFrame { @@ -673,7 +769,7 @@ pub fn dbscan_denseframes( let keep_vector = within_distance_apply( &denseframe.raw_peaks, &|peak| peak.mz, - mz_scaling, + &mz_scaling, &|i_right, i_left| (i_right - i_left) >= min_n, ); @@ -690,8 +786,8 @@ pub fn dbscan_denseframes( }; let converter = DenseFrameConverter { - mz_scaling: *mz_scaling, - ims_scaling: *ims_scaling, + mz_scaling, + ims_scaling, }; let peak_vec: Vec = dbscan_generic( converter, @@ -702,6 +798,8 @@ pub fn dbscan_denseframes( None::<&FFTimsPeak>, None, true, + &[max_mz_extension as Float, max_ims_extension as Float], + None::<BypassDenseFrameBackConverter>, ); frames::DenseFrame { diff --git a/src/aggregation/ms_denoise.rs b/src/aggregation/ms_denoise.rs index c4e280f..be71368 100644 --- a/src/aggregation/ms_denoise.rs +++ b/src/aggregation/ms_denoise.rs @@ -13,6 +13,35 @@ use indicatif::ParallelProgressIterator; use log::{info, trace, warn}; use rayon::prelude::*; use timsrust::Frame; +use serde::{Deserialize, Serialize}; + +// TODO I can probably split the ms1 and ms2 ...
+#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +pub struct DenoiseConfig { + pub mz_scaling: f32, + pub ims_scaling: f32, + pub max_mz_expansion_ratio: f32, + pub max_ims_expansion_ratio: f32, + pub ms2_min_n: u8, + pub ms1_min_n: u8, + pub ms1_min_cluster_intensity: u32, + pub ms2_min_cluster_intensity: u32, +} + +impl Default for DenoiseConfig { + fn default() -> Self { + DenoiseConfig { + mz_scaling: 0.015, + ims_scaling: 0.015, + max_mz_expansion_ratio: 1., + max_ims_expansion_ratio: 4., + ms2_min_n: 5, + ms1_min_n: 10, + ms1_min_cluster_intensity: 100, + ms2_min_cluster_intensity: 100, + } + } +} #[derive(Debug, Clone, Copy)] struct FrameStats { @@ -78,16 +107,25 @@ fn _denoise_denseframe( frame: DenseFrame, min_n: usize, min_intensity: u64, - mz_scaling: &f64, - ims_scaling: &f32, + mz_scaling: f64, + max_mz_extension: f64, + ims_scaling: f32, + max_ims_extension: f32, ) -> DenseFrame { // I am 99% sure the compiler will remove this section when optimizing ... but I still need to test it. let frame_stats_start = FrameStats::new(&frame); let index = frame.index; // this is the line that matters // TODO move the scalings to parameters - let denoised_frame = - dbscan::dbscan_denseframes(frame, mz_scaling, ims_scaling, min_n, min_intensity); + let denoised_frame = dbscan::dbscan_denseframes( + frame, + mz_scaling, + max_mz_extension, + ims_scaling, + max_ims_extension, + min_n, + min_intensity, + ); let frame_stats_end = FrameStats::new(&denoised_frame); if cfg!(debug_assertions) { @@ -104,8 +142,10 @@ fn _denoise_dia_frame( dia_frame_info: &DIAFrameInfo, ims_converter: &timsrust::Scan2ImConverter, mz_converter: &timsrust::Tof2MzConverter, - mz_scaling: &f64, - ims_scaling: &f32, + mz_scaling: f64, + max_mz_extension: f64, + ims_scaling: f32, + max_ims_extension: f32, ) -> Vec { let frame_windows = dia_frame_info .split_frame(frame) @@ -125,7 +165,9 @@ min_n, min_intensity, mz_scaling, + max_mz_extension, ims_scaling, + max_ims_extension, ); DenseFrameWindow { @@ -223,6 +265,8 @@ struct FrameDenoiser { min_intensity: u64, mz_scaling: f64, ims_scaling: f32, + max_mz_extension: f64, + max_ims_extension: f32, ims_converter: timsrust::Scan2ImConverter, mz_converter: timsrust::Tof2MzConverter, } @@ -234,8 +278,10 @@ impl<'a> Denoiser<'a, Frame, DenseFrame, Converters, Option> for FrameDenoiser denseframe, self.min_n, self.min_intensity, - &self.mz_scaling, - &self.ims_scaling, + self.mz_scaling, + self.max_mz_extension, + self.ims_scaling, + self.max_ims_extension, ) } } @@ -244,7 +290,9 @@ struct DIAFrameDenoiser { min_n: usize, min_intensity: u64, mz_scaling: f64, + max_mz_extension: f64, ims_scaling: f32, + max_ims_extension: f32, dia_frame_info: DIAFrameInfo, ims_converter: timsrust::Scan2ImConverter, mz_converter: timsrust::Tof2MzConverter, } @@ -261,18 +309,24 @@ impl<'a> Denoiser<'a, Frame, Vec, Converters, Option> &self.dia_frame_info, &self.ims_converter, &self.mz_converter, - &self.mz_scaling, - &self.ims_scaling, + self.mz_scaling, + self.max_mz_extension, + self.ims_scaling, + self.max_ims_extension, ) } } + +// Right now this is dead code, but it will be resurrected soon ...
pub fn read_all_ms1_denoising( path: String, min_intensity: u64, min_n: usize, mz_scaling: f64, + max_mz_extension: f64, ims_scaling: f32, + max_ims_extension: f32, record_stream: &mut Option, ) -> Vec { let reader = timsrust::FileReader::new(path).unwrap(); @@ -295,7 +349,9 @@ pub fn read_all_ms1_denoising( let ms1_denoiser = FrameDenoiser { min_n, mz_scaling, + max_mz_extension, ims_scaling, + max_ims_extension, min_intensity, ims_converter, mz_converter, @@ -310,12 +366,10 @@ pub fn read_all_ms1_denoising( } // This could probably be a macro ... +// Maybe I should just pass the config ... instead of the individual values pub fn read_all_dia_denoising( path: String, - min_n: usize, - min_intensity: u64, - mz_scaling: f64, - ims_scaling: f32, + config: DenoiseConfig, record_stream: &mut Option, ) -> (Vec, DIAFrameInfo) { let mut timer = utils::ContextTimer::new("Reading all DIA frames", true, utils::LogLevel::INFO); @@ ... let mz_converter = reader.get_tof_converter().unwrap(); timer.stop(true); - // frames = frames - // .into_iter() - // .filter(|frame| match frame.frame_type { - // timsrust::FrameType::MS2(timsrust::AcquisitionType::DIAPASEF) => true, - // _ => false, - // }) - // .collect(); frames.retain(|frame| match frame.frame_type { timsrust::FrameType::MS2(timsrust::AcquisitionType::DIAPASEF) => true, _ => false, }); - // let min_intensity = 50u64; - // let min_n: usize = 2; - let denoiser = DIAFrameDenoiser { - min_n, - min_intensity, - mz_scaling, - ims_scaling, + min_n: config.ms2_min_n.into(), + min_intensity: config.ms2_min_cluster_intensity.into(), + mz_scaling: config.mz_scaling.into(), + max_mz_extension: config.max_mz_expansion_ratio.into(), + ims_scaling: config.ims_scaling, + max_ims_extension: config.max_ims_expansion_ratio, dia_frame_info: dia_info.clone(), ims_converter, mz_converter, diff --git a/src/aggregation/tracing.rs b/src/aggregation/tracing.rs index 3debc08..14ac700 100644 --- a/src/aggregation/tracing.rs +++ b/src/aggregation/tracing.rs @@ -13,12 +13,39 @@ use rayon::iter::IntoParallelIterator; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use serde::ser::{Serializer, SerializeStruct}; +use core::panic; use std::error::Error; use std::io::Write; use std::path::Path; type QuadLowHigh = (f64, f64); +#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +pub struct TracingConfig { + pub mz_scaling: f32, + pub rt_scaling: f32, + pub ims_scaling: f32, + pub max_mz_expansion_ratio: f32, + pub max_rt_expansion_ratio: f32, + pub max_ims_expansion_ratio: f32, + pub min_n: u8, + pub min_neighbor_intensity: u32, +} + +impl Default for TracingConfig { + fn default() -> Self { + TracingConfig { + mz_scaling: 0.015, + rt_scaling: 2.4, + ims_scaling: 0.02, + max_mz_expansion_ratio: 1., + max_rt_expansion_ratio: 1.5, + max_ims_expansion_ratio: 4., + min_n: 3, + min_neighbor_intensity: 450, + } + } +} // Serialize #[derive(Debug, Clone, Copy)] @@ -176,14 +203,15 @@ impl TraceLike for BaseTrace { pub fn combine_traces( denseframe_windows: Vec, - mz_scaling: f64, - rt_scaling: f64, - ims_scaling: f64, - min_n: usize, - min_intensity: u32, + config: TracingConfig, rt_binsize: f32, record_stream: &mut Option, ) -> Vec { // Grouping by quad windows + group id let mut timer = utils::ContextTimer::new("Tracing peaks in time", true, utils::LogLevel::INFO); @@ -226,11 +254,14 @@ pub fn combine_traces( .map(|x| {
_combine_single_window_traces( x, - mz_scaling, - rt_scaling, - ims_scaling, - min_n, - min_intensity, + config.mz_scaling.into(), + config.max_mz_expansion_ratio, + config.rt_scaling.into(), + config.max_rt_expansion_ratio, + config.ims_scaling.into(), + config.max_ims_expansion_ratio, + config.min_n.into(), + config.min_neighbor_intensity, rt_binsize, ) }) @@ -426,6 +457,15 @@ impl NDPointConverter for TimeTimsPeakConverter { } } +struct BypassBaseTraceBackConverter {} + +impl NDPointConverter for BypassBaseTraceBackConverter { + fn convert(&self, _elem: &BaseTrace) -> NDPoint<3> { + panic!("This should never be called"); + } +} + + fn _flatten_denseframe_vec(denseframe_windows: Vec) -> Vec { denseframe_windows .into_iter() @@ -449,17 +489,22 @@ fn _flatten_denseframe_vec(denseframe_windows: Vec) -> Vec bool; + +// TODO maybe this can be a builder -> executor pattern fn _combine_single_window_traces( prefiltered_peaks: Vec, mz_scaling: f64, + max_mz_expansion_ratio: f32, rt_scaling: f64, + max_rt_expansion_ratio: f32, ims_scaling: f64, + max_ims_expansion_ratio: f32, min_n: usize, min_intensity: u32, rt_binsize: f32, ) -> Vec { debug!("Prefiltered peaks: {}", prefiltered_peaks.len()); - let converter = TimeTimsPeakConverter { + let converter: TimeTimsPeakConverter = TimeTimsPeakConverter { mz_scaling, rt_scaling, ims_scaling, @@ -468,10 +513,15 @@ fn _combine_single_window_traces( prefiltered_peaks[0].quad_low_high.0, prefiltered_peaks[0].quad_low_high.1, ); + let max_extension_distances: [f32; 3] = [ + max_mz_expansion_ratio, + max_rt_expansion_ratio, + max_ims_expansion_ratio, + ]; warn!("Assuming all quad windows are the same!!! (fine for diaPASEF)"); // TODO make dbscan_generic a runner-class - let foo: Vec = dbscan_generic( + let out_traces: Vec = dbscan_generic( converter, prefiltered_peaks, min_n, min_intensity.into(), @@ -484,15 +534,17 @@ fn _combine_single_window_traces( num_peaks: 0, num_rt_peaks: 0, quad_low_high: window_quad_low_high, - btree_chromatogram: BTreeChromatogram::new_lazy(rt_binsize.clone()), + btree_chromatogram: BTreeChromatogram::new_lazy(rt_binsize), }, None::<&FFTimeTimsPeak>, None, false, + &max_extension_distances, + None::<BypassBaseTraceBackConverter>, ); - debug!("Combined traces: {}", foo.len()); - foo + debug!("Combined traces: {}", out_traces.len()); + out_traces } // NOW ...
combine traces into pseudospectra @@ -603,10 +655,8 @@ impl<'a> ClusterAggregator for PseudoSpectrumAggregat struct BaseTraceConverter { rt_scaling: f64, - // rt_start_end_ratio: f64, ims_scaling: f64, quad_scaling: f64, - // peak_width_prior: f64, } impl NDPointConverter for BaseTraceConverter { @@ -652,22 +702,69 @@ impl NDPointConverter for BaseTraceConverter { } } -pub fn combine_pseudospectra( - traces: Vec, +struct PseudoScanBackConverter { rt_scaling: f64, ims_scaling: f64, quad_scaling: f64, - min_intensity: u32, - min_n: usize, +} + +impl NDPointConverter for PseudoScanBackConverter { + fn convert(&self, elem: &PseudoSpectrum) -> NDPoint<3> { + let quad_mid = (elem.quad_low + elem.quad_high) / 2.; + NDPoint { + values: [ + (elem.rt as f64 / self.rt_scaling) as Float, + (elem.ims as f64 / self.ims_scaling) as Float, + (quad_mid as f64 / self.quad_scaling) as Float, + ], + } + } +} + + + + +#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +pub struct PseudoscanGenerationConfig { + pub rt_scaling: f32, + pub quad_scaling: f32, + pub ims_scaling: f32, + pub max_rt_expansion_ratio: f32, + pub max_quad_expansion_ratio: f32, + pub max_ims_expansion_ratio: f32, + pub min_n: u8, + pub min_neighbor_intensity: u32, +} + +impl Default for PseudoscanGenerationConfig { + fn default() -> Self { + PseudoscanGenerationConfig { + rt_scaling: 2.4, + quad_scaling: 5., + ims_scaling: 0.015, + max_rt_expansion_ratio: 5., + max_quad_expansion_ratio: 1., + max_ims_expansion_ratio: 2., + min_n: 6, + min_neighbor_intensity: 6000, + } + } +} + + +pub fn combine_pseudospectra( + traces: Vec, + config: PseudoscanGenerationConfig, record_stream: &mut Option, ) -> Vec { let mut timer = utils::ContextTimer::new("Combining pseudospectra", true, utils::LogLevel::INFO); let converter = BaseTraceConverter { - rt_scaling, - ims_scaling, - quad_scaling, + rt_scaling: config.rt_scaling.into(), + ims_scaling: config.ims_scaling.into(), + quad_scaling: config.quad_scaling.into(), + // rt_start_end_ratio: 2., // peak_width_prior: 0.75, }; @@ -688,15 +785,29 @@ pub fn combine_pseudospectra( within_iou_tolerance && within_cosine_tolerance }; + + let back_converter = PseudoScanBackConverter { + rt_scaling: config.rt_scaling.into(), + ims_scaling: config.ims_scaling.into(), + quad_scaling: config.quad_scaling.into(), + }; + let max_extension_distances: [Float; 3] = [ + config.max_rt_expansion_ratio as Float, + config.max_ims_expansion_ratio as Float, + config.max_quad_expansion_ratio as Float, + ]; + let foo: Vec = dbscan_generic( converter, traces, - min_n, - min_intensity.into(), + config.min_n.into(), + config.min_neighbor_intensity.into(), PseudoSpectrumAggregator::default, Some(&extra_filter_fun), Some(utils::LogLevel::INFO), false, + &max_extension_distances, + Some(back_converter), ); info!("Combined pseudospectra: {}", foo.len()); diff --git a/src/main.rs b/src/main.rs index c11f244..074e09a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,16 +24,16 @@ use clap::Parser; use crate::scoring::SageSearchConfig; use serde::{Deserialize, Serialize}; +use std::env; use std::fs; use std::path::Path; -use std::env; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] struct Args { #[arg(short, long)] config: String, - #[arg(short, long, default_value = "peakachu_output")] + #[arg(short, long, default_value = "ionmesh_output")] output_dir: String, #[arg(long, action)] write_template: bool, @@ -41,75 +41,9 @@ struct Args { files: Vec, } -#[derive(Debug, Serialize, Deserialize, Clone, Copy)] 
-struct TracingConfig { - mz_scaling: f32, - rt_scaling: f32, - ims_scaling: f32, - min_n: u8, - min_neighbor_intensity: u32, -} - -impl Default for TracingConfig { - fn default() -> Self { - TracingConfig { - mz_scaling: 0.02, - rt_scaling: 2.2, - ims_scaling: 0.02, - min_n: 2, - min_neighbor_intensity: 200, - } - } -} - -#[derive(Debug, Serialize, Deserialize, Clone, Copy)] -struct DenoiseConfig { - mz_scaling: f32, - ims_scaling: f32, - ms2_min_n: u8, - ms1_min_n: u8, - ms1_min_cluster_intensity: u32, - ms2_min_cluster_intensity: u32, -} - -impl Default for DenoiseConfig { - fn default() -> Self { - DenoiseConfig { - mz_scaling: 0.02, - ims_scaling: 0.02, - ms2_min_n: 2, - ms1_min_n: 3, - ms1_min_cluster_intensity: 100, - ms2_min_cluster_intensity: 100, - } - } -} - -#[derive(Debug, Serialize, Deserialize, Clone, Copy)] -struct PseudoscanGenerationConfig { - rt_scaling: f32, - quad_scaling: f32, - ims_scaling: f32, - min_n: u8, - min_neighbor_intensity: u32, -} - -impl Default for PseudoscanGenerationConfig { - fn default() -> Self { - PseudoscanGenerationConfig { - rt_scaling: 2.2, - quad_scaling: 5., - ims_scaling: 0.02, - min_n: 5, - min_neighbor_intensity: 200, - } - } -} - - #[derive(Debug, Serialize, Deserialize, Clone)] struct OutputConfig { - // + // debug_scans_json: Option, debug_traces_csv: Option, out_features_csv: Option, @@ -127,14 +61,13 @@ impl Default for OutputConfig { #[derive(Debug, Default, Serialize, Deserialize, Clone)] struct Config { - denoise_config: DenoiseConfig, - tracing_config: TracingConfig, - pseudoscan_generation_config: PseudoscanGenerationConfig, + denoise_config: aggregation::ms_denoise::DenoiseConfig, + tracing_config: aggregation::tracing::TracingConfig, + pseudoscan_generation_config: aggregation::tracing::PseudoscanGenerationConfig, sage_search_config: SageSearchConfig, output_config: OutputConfig, } - impl Config { fn from_toml(path: String) -> Result> { let config_str = std::fs::read_to_string(path)?; @@ -184,18 +117,9 @@ fn main() { } // TODO: consider moving this to the config struct as an implementation.
- let out_path_scans = match config.output_config.debug_scans_json { - Some(ref path) => Some(Path::new(path).to_path_buf()), - None => None, - }; - let out_traces_path = match config.output_config.debug_traces_csv { - Some(ref path) => Some(Path::new(path).to_path_buf()), - None => None, - }; - let out_path_features = match config.output_config.out_features_csv { - Some(ref path) => Some(Path::new(path).to_path_buf()), - None => None, - }; + let out_path_scans = config.output_config.debug_scans_json.as_ref().map(|path| out_path_dir.join(path)); + let out_traces_path = config.output_config.debug_traces_csv.as_ref().map(|path| out_path_dir.join(path)); + let out_path_features = config.output_config.out_features_csv.as_ref().map(|path| out_path_dir.join(path)); let mut traces_from_cache = env::var("DEBUG_TRACES_FROM_CACHE").is_ok(); if traces_from_cache && out_path_scans.is_none() { @@ -210,30 +134,22 @@ fn main() { log::info!("Reading DIA data from: {}", path_use); let (dia_frames, dia_info) = aggregation::ms_denoise::read_all_dia_denoising( path_use.clone(), - config.denoise_config.ms2_min_n.into(), - config.denoise_config.ms2_min_cluster_intensity.into(), - config.denoise_config.mz_scaling.into(), - config.denoise_config.ims_scaling, + config.denoise_config, &mut rec, ); let cycle_time = dia_info.calculate_cycle_time(); + // TODO add here expansion limits let mut traces = aggregation::tracing::combine_traces( dia_frames, - config.tracing_config.mz_scaling.into(), - config.tracing_config.rt_scaling.into(), - config.tracing_config.ims_scaling.into(), - config.tracing_config.min_n.into(), - config.tracing_config.min_neighbor_intensity, - cycle_time as f32, + config.tracing_config, + cycle_time, &mut rec, ); let out = match out_traces_path { - Some(out_path) => { - aggregation::tracing::write_trace_csv(&traces, out_path) - } + Some(out_path) => aggregation::tracing::write_trace_csv(&traces, out_path), None => Ok(()), }; match out { @@ -245,22 +161,21 @@ fn main() { println!("traces: {:?}", traces.len()); traces.retain(|x| x.num_agg > 5); - traces.retain(|x| x.num_rt_points >= 2); - println!("traces: {:?}", traces[traces.len()-5]); println!("traces: {:?}", traces.len()); + if traces.len() > 5 { + println!("sample_trace: {:?}", traces[traces.len() - 4]) + } // Maybe reparametrize as 1.1 cycle time + // TODO add here expansion limits let pseudoscans = aggregation::tracing::combine_pseudospectra( traces, - config.pseudoscan_generation_config.rt_scaling.into(), - config.pseudoscan_generation_config.ims_scaling.into(), - config.pseudoscan_generation_config.quad_scaling.into(), - config.pseudoscan_generation_config.min_neighbor_intensity, - config.pseudoscan_generation_config.min_n.into(), + config.pseudoscan_generation_config, &mut rec, ); // Report min/max/average/std and skew for ims and rt + // This can probably be a macro ...
let ims_stats = utils::get_stats(&pseudoscans.iter().map(|x| x.ims as f64).collect::>()); let ims_sd_stats = utils::get_stats( @@ -293,9 +208,7 @@ fn main() { println!("npeaks: {:?}", npeaks); let out = match out_path_scans { - Some(out_path) => { - aggregation::tracing::write_pseudoscans_json(&pseudoscans, out_path) - } + Some(out_path) => aggregation::tracing::write_pseudoscans_json(&pseudoscans, out_path), None => Ok(()), }; @@ -315,6 +228,7 @@ fn main() { pseudoscans, config.sage_search_config, out_path_features.clone(), + 1, ); match score_out { Ok(_) => {} diff --git a/src/ms/frames.rs b/src/ms/frames.rs index ad12e34..91fe0f8 100644 --- a/src/ms/frames.rs +++ b/src/ms/frames.rs @@ -1,4 +1,4 @@ -use serde::de; + pub use timsrust::Frame; pub use timsrust::FrameType; pub use timsrust::{ @@ -42,7 +42,7 @@ pub enum SortingOrder { } /// Unprocessed data from a 'Frame' after breaking by quad isolation_window + ims window. -/// +/// /// /// 1. every tof-index + intensity represents a peak. /// 2. Scan offsets are monotonically increasing. @@ -150,7 +150,7 @@ impl DenseFrame { let num_tofs = index_offset - last_scan_offset; let ims = ims_converter.convert(scan_index as u32) as f32; - expanded_scan_indices.extend(vec![ims; num_tofs as usize]); + expanded_scan_indices.extend(vec![ims; num_tofs]); last_scan_offset = *index_offset; } @@ -342,7 +342,7 @@ impl RerunPlottable> for DenseFrame { &rerun::Points2D::new( quad_points .iter() - .map(|point| (point.values[0] as f32, point.values[1] as f32)), + .map(|point| (point.values[0], point.values[1])), ) .with_radii(radii), )?; diff --git a/src/ms/tdf.rs b/src/ms/tdf.rs index 1954d18..a420aca 100644 --- a/src/ms/tdf.rs +++ b/src/ms/tdf.rs @@ -169,8 +169,8 @@ impl DIAFrameInfo { let scan_offsets_use = &frame.scan_offsets[scan_range.scan_start..(scan_range.scan_end - 1)]; let scan_start = scan_offsets_use[0]; - let mz_indptr_start = scan_offsets_use[0] as usize; - let mz_indptr_end = *scan_offsets_use.last().unwrap() as usize; + let mz_indptr_start = scan_offsets_use[0]; + let mz_indptr_end = *scan_offsets_use.last().unwrap(); let tof_indices_keep = frame.tof_indices[mz_indptr_start..mz_indptr_end].to_vec(); let intensities_keep = frame.intensities[mz_indptr_start..mz_indptr_end].to_vec(); @@ -503,7 +503,5 @@ pub fn read_dia_frame_info(dotd_file: String) -> Result { retention_times: DIAFrameInfo::rts_from_tdf_connection(&conn)?, }; - let rt = frame_info.calculate_cycle_time(); - Ok(frame_info) } diff --git a/src/scoring.rs b/src/scoring.rs index 3a64c48..3312ec2 100644 --- a/src/scoring.rs +++ b/src/scoring.rs @@ -174,6 +174,7 @@ pub fn score_pseudospectra( elems: Vec, config: SageSearchConfig, out_path_features: Option, + num_report_psms: usize, ) -> Result, Box> { // 1. Build raw spectra from the pseudospectra @@ -243,6 +244,11 @@ pub fn score_pseudospectra( // Right now the precursor tolerance should be ignored // because we are using wide window mode for the search.
+ // Chimeric search only makes sense when more than one PSM is reported per spectrum. + let chimera = num_report_psms > 1; + let precursor_tolerance = Tolerance::Da(-15., 15.); let scorer = Scorer { db: &db, @@ -256,8 +262,8 @@ pub fn score_pseudospectra( max_fragment_charge: Some(3), min_fragment_mass: 100., max_fragment_mass: 4000., - chimera: false, - report_psms: 1, + chimera, + report_psms: num_report_psms, wide_window: true, annotate_matches: false, }; diff --git a/src/space/quad.rs b/src/space/quad.rs index cd51985..89d097e 100644 --- a/src/space/quad.rs +++ b/src/space/quad.rs @@ -75,9 +75,7 @@ impl<'a, T> RadiusQuadTree<'a, T> { // This means any sub-division will be smaller than the radius let query_contained = radius_squared > distance_squared; - if self.points.len() < self.capacity { - self.points.push((point, data)); - } else if query_contained { + if (self.points.len() < self.capacity) || query_contained { self.points.push((point, data)); } else { self.subdivide(); diff --git a/src/space/space_generics.rs b/src/space/space_generics.rs index d9d5db6..dbcb5ee 100644 --- a/src/space/space_generics.rs +++ b/src/space/space_generics.rs @@ -72,6 +72,20 @@ impl NDBoundary { NDBoundary::new(starts, ends) } + + pub fn expand(&mut self, factors: &[Float; D]) { + for (i, ef) in factors.iter().enumerate() { + let mut half_width = self.widths[i] / 2.0; + let center = self.centers[i]; + + half_width *= ef; + + self.starts[i] = center - half_width; + self.ends[i] = center + half_width; + self.widths[i] = self.ends[i] - self.starts[i]; + self.centers[i] = (self.ends[i] + self.starts[i]) / 2.0; + } + } } // #[derive(Debug, Clone, Copy)] @@ -81,6 +95,7 @@ pub struct NDPoint { pub values: [Float; DIMENSIONALITY], } +// Q: is there any instance where T is not usize? pub trait IndexedPoints<'a, const N: usize, T> { fn query_ndpoint(&'a self, point: &NDPoint) -> Vec<&'a T>; fn query_ndrange( diff --git a/tune/.gitignore b/tune/.gitignore new file mode 100644 index 0000000..ff89e2f --- /dev/null +++ b/tune/.gitignore @@ -0,0 +1,3 @@ + +tune_out*/ +tune.db* diff --git a/tune/requirements.txt b/tune/requirements.txt new file mode 100644 index 0000000..2ba83f9 --- /dev/null +++ b/tune/requirements.txt @@ -0,0 +1,7 @@ +optuna +optuna-dashboard +toml +tomli-w +polars +loguru +matplotlib diff --git a/tune/tune.py b/tune/tune.py new file mode 100644 index 0000000..4f5b7b1 --- /dev/null +++ b/tune/tune.py @@ -0,0 +1,220 @@ +import optuna +import tomli_w +import tomllib +import subprocess +from loguru import logger +import argparse +from pathlib import Path +from dataclasses import dataclass +import random +import polars as pl +import json +from matplotlib import pyplot as plt +import numpy as np +import time +import pprint + +# DEBUG_TRACES_FROM_CACHE + + +@dataclass +class IonMeshTuner: + fasta: Path + base_config: Path + target_file: Path + ionmesh_executable: Path + base_out_dir: Path = Path("tune_out") + + def objective(self, trial: optuna.Trial): + trial_id = trial.number + base_toml = tomllib.load(open(self.base_config, "rb")) + tempdir = self.base_out_dir / f"{trial_id}_{random.randint(0, 1000000)}" + while tempdir.exists(): + tempdir = self.base_out_dir / f"{trial_id}_{random.randint(0, 1000000)}" + + logger.info(f"Running with {tempdir}") + + tempdir.mkdir(exist_ok=True, parents=True) + + # denoise_mz_scale = trial.suggest_float("dn_mz", 0.005, 0.03) + # denoise_ims_scale = trial.suggest_float("dn_ims", 0.005, 0.03) + denoise_min_n = trial.suggest_int("dn_min_n", 2, 10) + # base_toml["denoise_config"]["mz_scaling"] = denoise_mz_scale + # 
base_toml["denoise_config"]["ims_scaling"] = denoise_ims_scale + base_toml["denoise_config"]["ms2_min_n"] = denoise_min_n + + # tracing_mz_scale = trial.suggest_float("t_mz", 0.01, 0.02) + # tracing_ims_scale = trial.suggest_float("t_ims", 0.01, 0.03) + tracing_rt_scale = trial.suggest_float("t_rt", 0.5, 2.5) + tracing_min_intensity = trial.suggest_int("t_min", 50, 5000) + tracing_min_n = trial.suggest_int("t_min_n", 2, 10) + # base_toml["tracing_config"]["mz_scaling"] = tracing_mz_scale + # base_toml["tracing_config"]["ims_scaling"] = tracing_ims_scale + base_toml["tracing_config"]["rt_scaling"] = tracing_rt_scale + base_toml["tracing_config"]["min_neighbor_intensity"] = tracing_min_intensity + base_toml["tracing_config"]["min_n"] = tracing_min_n + + pseudoscan_rt_scale = trial.suggest_float("ps_rt", 0.5, 2.5) + # pseudoscan_ims_scale = trial.suggest_float("ps_ims", 0.01, 0.03) + pseudoscan_min_intensity = trial.suggest_int("ps_min", 50, 10000) + pseudoscan_min_n = trial.suggest_int("ps_min_n", 2, 10) + base_toml["pseudoscan_generation_config"]["rt_scaling"] = pseudoscan_rt_scale + # base_toml["pseudoscan_generation_config"]["ims_scaling"] = pseudoscan_ims_scale + base_toml["pseudoscan_generation_config"]["min_neighbor_intensity"] = ( + pseudoscan_min_intensity + ) + base_toml["pseudoscan_generation_config"]["min_n"] = pseudoscan_min_n + + base_toml["sage_search_config"]["fasta_path"] = str(self.fasta) + + temp_toml = tempdir / "temp.toml" + tomli_w.dump(base_toml, open(temp_toml, "wb")) + + logger.info(f"Running with config: {pprint.pformat(base_toml)}") + + run_start = time.time() + try: + subprocess.run( + [ + str(self.ionmesh_executable), + "--config", + str(temp_toml), + "--output-dir", + tempdir, + str(self.target_file), + ], + check=True, + ) + except subprocess.CalledProcessError as e: + logger.error(f"Error running IonMesh: {e}") + return float("nan") + + run_end = time.time() + logger.info(f"Run time: {run_end - run_start}") + + try: + delta_discriminant = self.read_results_dir(tempdir) + except pl.exceptions.NoDataError: + logger.error("No data found") + return float("nan") + + return delta_discriminant + + def read_results_dir(self, results_dir): + logger.info(f"Reading results from {results_dir}") + pseudoscan_df = pl.DataFrame( + json.load(open(str(results_dir / "debug_scans.json"))), strict=False + ) + (results_dir / "debug_scans.json").unlink() + + logger.info("Converting scans to parquet") + pseudoscan_df.write_parquet(str(results_dir / "debug_scans.parquet")) + del pseudoscan_df + + # Still pretty big ... 
I will delete it for now + # pl.scan_csv(str(results_dir / "debug_traces.csv")).sink_parquet( + # str(results_dir / "debug_traces.parquet") + # ) + (results_dir / "debug_traces.csv").unlink() + + logger.info("Reading features/scores") + sage_features = pl.scan_csv(results_dir / "features.csv") + sage_features.sink_parquet(results_dir / "features.parquet") + (results_dir / "features.csv").unlink() + sage_features = pl.scan_parquet(results_dir / "features.parquet") + + # ['peptide', 'psm_id', 'peptide_len', 'spec_id', 'file_id', + # 'rank', 'label', 'expmass', 'calcmass', 'charge', 'rt', 'aligned_rt', 'predicted_rt', + # 'delta_rt_model', 'delta_mass', 'isotope_error', 'average_ppm', 'hyperscore', + # 'delta_next', 'delta_best', 'matched_peaks', 'longest_b', 'longest_y', 'longest_y_pct', + # 'missed_cleavages', 'matched_intensity_pct', 'scored_candidates', 'poisson', + # 'discriminant_score', 'posterior_error', 'spectrum_q', 'peptide_q', 'protein_q', + # 'ms2_intensity'] + + # Keep the best-scoring PSM for each peptide (targets and decoys alike) + TARGET_SCORE = "hyperscore" + sage_features = sage_features.sort(TARGET_SCORE, descending=True) + sage_features = sage_features.group_by("peptide").head(1) + + targets = sage_features.filter(pl.col("label") == 1) + decoys = sage_features.filter(pl.col("label") < 0) + + target_discriminant = targets.select(TARGET_SCORE).collect() + target_discriminant_sum = target_discriminant[TARGET_SCORE].sum() + decoy_discriminant = decoys.select(TARGET_SCORE).collect() + decoy_discriminant_sum = decoy_discriminant[TARGET_SCORE].sum() + + fig, ax = plt.subplots() + bins = np.histogram_bin_edges( + np.concatenate( + [ + target_discriminant[TARGET_SCORE], + decoy_discriminant[TARGET_SCORE], + ] + ), + bins=100, + ) + ax.hist( + target_discriminant[TARGET_SCORE], + bins=bins, + alpha=0.5, + label="Target", + ) + ax.hist( + decoy_discriminant[TARGET_SCORE], + bins=bins, + alpha=0.5, + label="Decoy", + ) + ax.legend() + plt.savefig(results_dir / f"{TARGET_SCORE}_histogram.png") + plt.close(fig) # avoid accumulating open figures across the study's many trials + + logger.info( + f"Target {TARGET_SCORE}: {target_discriminant_sum} / {len(target_discriminant)}" + ) + logger.info( + f"Decoy {TARGET_SCORE}: {decoy_discriminant_sum} / {len(decoy_discriminant)}" + ) + + delta_discriminant = target_discriminant_sum - decoy_discriminant_sum + + return delta_discriminant + + +def build_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--fasta", type=Path) + parser.add_argument("--base_config", type=Path) + parser.add_argument("--target_file", type=Path) + parser.add_argument("--ionmesh_executable", type=Path) + parser.add_argument("--output", type=Path, default=Path("tune_out")) + parser.add_argument("--name", type=str, default="tune") + return parser + + +def main(): + parser = build_parser() + args, unkargs = parser.parse_known_args() + if unkargs: + raise ValueError(f"Unknown arguments: {unkargs}") + + tuner = IonMeshTuner( + fasta=args.fasta, + base_config=args.base_config, + target_file=args.target_file, + ionmesh_executable=args.ionmesh_executable, + base_out_dir=args.output, + ) + + study = optuna.create_study( + storage=f"sqlite:///{args.name}.db", + study_name=args.name, + load_if_exists=True, + direction="maximize", + ) + study.optimize(tuner.objective, n_trials=100) + logger.info(study.best_params) + + +if __name__ == "__main__": + main()
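For reference, the scalar that `objective` hands back to Optuna (and that the study maximizes) can be written compactly. With $s(\cdot)$ the Sage hyperscore and one best PSM kept per peptide, the tuner maximizes

$$\Delta \;=\; \sum_{p \,\in\, \text{target peptides}} \max_{\text{PSMs of } p} s \;\;-\;\; \sum_{d \,\in\, \text{decoy peptides}} \max_{\text{PSMs of } d} s$$

i.e. the total score mass of deduplicated targets minus that of deduplicated decoys. Runs that crash or produce no features return NaN, which Optuna records as a failed trial.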