diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 58550fa..476e6e8 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -10,9 +10,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: dtolnay/rust-toolchain@stable + - uses: actions-rust-lang/setup-rust-toolchain@v1 with: + toolchain: stable components: clippy - - run: cargo clippy --workspace --all-features - - run: cargo test --workspace --all-features + - run: cargo clippy --workspace --no-default-features --features="tokio,geoip2_support,tracing" + - run: cargo test --workspace --no-default-features --features="tokio,geoip2_support,tracing" - run: cargo run -p geosuggest-examples --release --bin simple diff --git a/Cargo.toml b/Cargo.toml index f316672..76a7f91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ config = "0.14" csv = "1" rayon = "1" strsim = "0.11" -kiddo = { version = "4.2", features = ["immutable"] } +kiddo = { version = "5.0", default-features = false } geoip2 = "0.1.7" bincode = "1.3.3" diff --git a/geosuggest-core/src/lib.rs b/geosuggest-core/src/lib.rs index 0de21ef..aef6a15 100644 --- a/geosuggest-core/src/lib.rs +++ b/geosuggest-core/src/lib.rs @@ -7,7 +7,9 @@ use std::time::Instant; use itertools::Itertools; -use kiddo::{self, float::kdtree::KdTree, SquaredEuclidean}; +use kiddo::{self, SquaredEuclidean}; + +use kiddo::immutable::float::kdtree::ImmutableKdTree; use rayon::prelude::*; use serde::{Deserialize, Serialize}; @@ -281,7 +283,9 @@ pub struct Engine { pub metadata: Option, #[serde(skip_serializing)] - tree: KdTree, + tree_index_to_geonameid: HashMap, + #[serde(skip_serializing)] + tree: ImmutableKdTree, #[cfg(feature = "geoip2_support")] #[serde(skip_serializing)] @@ -404,13 +408,13 @@ impl Engine { return None; } - let nearest_limit = if countries.is_some() { + let nearest_limit = std::num::NonZero::new(if countries.is_some() { // ugly hack try to fetch nearest cities in requested countries // much better is to build index for concrete countries self.geonames.len() } else { limit - }; + })?; let mut i1; let mut i2; @@ -428,7 +432,8 @@ impl Engine { .collect::>(); i1 = items.iter_mut().filter_map(move |nearest| { - let city = self.geonames.get(&nearest.item)?; + let geonameid = self.tree_index_to_geonameid.get(&(nearest.item as usize))?; + let city = self.geonames.get(geonameid)?; let country = city.country.as_ref()?; if countries.contains(&country.code) { Some((nearest, city)) @@ -439,7 +444,8 @@ impl Engine { &mut i1 } else { i2 = items.iter_mut().filter_map(|nearest| { - let city = self.geonames.get(&nearest.item)?; + let geonameid = self.tree_index_to_geonameid.get(&(nearest.item as usize))?; + let city = self.geonames.get(geonameid)?; Some((nearest, city)) }); &mut i2 @@ -539,10 +545,6 @@ impl Engine { #[cfg(feature = "tracing")] let now = Instant::now(); - let mut entries: Vec = Vec::new(); - let mut geonames: HashMap = HashMap::new(); - let mut capitals: HashMap = HashMap::new(); - let records = split_content_to_n_parts(&cities, rayon::current_num_threads()) .par_iter() .map(|chunk| { @@ -563,6 +565,16 @@ impl Engine { m1 }); + let mut geonames: Vec = Vec::with_capacity(records.len()); + let mut entries: Vec = Vec::with_capacity( + records.len() + * if !filter_languages.is_empty() { + filter_languages.len() + } else { + 1 + }, + ); + #[cfg(feature = "tracing")] tracing::info!( "Engine read {} cities took {}ms", @@ -843,7 +855,12 @@ impl Engine { None => None, }; - let mut tree = KdTree::new(); + let mut capitals: HashMap = + HashMap::with_capacity(if let Some(items) = &country_by_code { + items.len() + } else { + 0 + }); for record in records { // INCLUDE: @@ -875,13 +892,6 @@ impl Engine { let is_capital = feature_code == "PPLC"; - // prevent duplicates - if geonames.contains_key(&record.geonameid) { - continue; - } - - tree.add(&[record.latitude, record.longitude], record.geonameid); - let country_id = country_by_code .as_ref() .and_then(|m| m.get(&record.country_code).map(|c| c.geonameid)); @@ -960,39 +970,54 @@ impl Engine { } else { None }; - - geonames.insert( - record.geonameid, - CitiesRecord { - id: record.geonameid, - name: record.name, - country: country.as_ref().map(Country::from), - admin_division, - admin2_division, - latitude: record.latitude, - longitude: record.longitude, - timezone: record.timezone, - names: match names_by_id { - Some(ref mut names) => { - if is_capital { - names.get(&record.geonameid).cloned() - } else { - // don't hold unused data - names.remove(&record.geonameid) - } + geonames.push(CitiesRecord { + id: record.geonameid, + name: record.name, + country: country.as_ref().map(Country::from), + admin_division, + admin2_division, + latitude: record.latitude, + longitude: record.longitude, + timezone: record.timezone, + names: match names_by_id { + Some(ref mut names) => { + if is_capital { + names.get(&record.geonameid).cloned() + } else { + // don't hold unused data + names.remove(&record.geonameid) } - None => None, - }, - country_names, - admin1_names, - admin2_names, - population: record.population, + } + None => None, }, - ); + country_names, + admin1_names, + admin2_names, + population: record.population, + }); } + geonames.sort_unstable_by_key(|item| item.id); + geonames.dedup_by_key(|item| item.id); + + let tree_index_to_geonameid = HashMap::from_iter( + geonames + .iter() + .enumerate() + .map(|(index, item)| (index, item.id)), + ); + + let tree = ImmutableKdTree::new_from_slice( + geonames + .iter() + .map(|item| [item.latitude, item.longitude]) + .collect::>() + .as_slice(), + ); + let engine = Engine { - geonames, + geonames: HashMap::from_iter(geonames.into_iter().map(|item| (item.id, item))), + tree_index_to_geonameid, tree, entries, metadata: None, @@ -1112,16 +1137,35 @@ impl std::fmt::Display for GeoIP2Error { impl From for Engine { fn from(engine_dump: EngineDump) -> Engine { - let mut tree = KdTree::new(); - for (geonameid, record) in &engine_dump.geonames { - tree.add(&[record.latitude, record.longitude], *geonameid); - } + let mut items = engine_dump + .geonames + .values() + .map(|record| (record.id, [record.latitude, record.longitude])) + .collect::>(); + + items.sort_unstable_by_key(|item| item.0); + items.dedup_by_key(|item| item.0); + + let tree_index_to_geonameid = HashMap::from_iter( + items + .iter() + .enumerate() + .map(|(index, item)| (index, item.0)), + ); + let tree = ImmutableKdTree::new_from_slice( + items + .into_iter() + .map(|item| item.1) + .collect::>() + .as_slice(), + ); Engine { entries: engine_dump.entries, geonames: engine_dump.geonames, capitals: engine_dump.capitals, country_info_by_code: engine_dump.country_info_by_code, + tree_index_to_geonameid, tree, metadata: engine_dump.metadata, #[cfg(feature = "geoip2_support")] diff --git a/geosuggest-core/tests/lib.rs b/geosuggest-core/tests/lib.rs index 1b18d6a..fbefe6f 100644 --- a/geosuggest-core/tests/lib.rs +++ b/geosuggest-core/tests/lib.rs @@ -4,7 +4,6 @@ use geosuggest_core::{ }; use std::{env::temp_dir, error::Error}; -#[cfg(feature = "geoip2_support")] #[cfg(feature = "geoip2_support")] use std::{net::IpAddr, str::FromStr};