From d397b375d8453daaca78c61d36eec070e3ee9ac2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 02:49:24 +0000 Subject: [PATCH 1/5] Update kiddo requirement from 4.2 to 5.0 Updates the requirements on [kiddo](https://github.com/sdd/kiddo) to permit the latest version. - [Release notes](https://github.com/sdd/kiddo/releases) - [Changelog](https://github.com/sdd/kiddo/blob/master/CHANGELOG.md) - [Commits](https://github.com/sdd/kiddo/compare/v4.2.0...v4.2.1) --- updated-dependencies: - dependency-name: kiddo dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f316672..b747a1e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ config = "0.14" csv = "1" rayon = "1" strsim = "0.11" -kiddo = { version = "4.2", features = ["immutable"] } +kiddo = { version = "5.0", features = ["immutable"] } geoip2 = "0.1.7" bincode = "1.3.3" From 55adddab864d8a47742d0dedb1dfaa8e4bf5b853 Mon Sep 17 00:00:00 2001 From: Evgeniy Tatarkin Date: Mon, 9 Dec 2024 19:55:53 +0300 Subject: [PATCH 2/5] fix: use tree index to geonames id map --- Cargo.toml | 2 +- geosuggest-core/src/lib.rs | 69 +++++++++++++++++++++++++++++------- geosuggest-core/tests/lib.rs | 1 - 3 files changed, 58 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b747a1e..76a7f91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ config = "0.14" csv = "1" rayon = "1" strsim = "0.11" -kiddo = { version = "5.0", features = ["immutable"] } +kiddo = { version = "5.0", default-features = false } geoip2 = "0.1.7" bincode = "1.3.3" diff --git a/geosuggest-core/src/lib.rs b/geosuggest-core/src/lib.rs index 0de21ef..6377df6 100644 --- a/geosuggest-core/src/lib.rs +++ b/geosuggest-core/src/lib.rs @@ -7,7 +7,9 @@ use std::time::Instant; use itertools::Itertools; -use kiddo::{self, float::kdtree::KdTree, SquaredEuclidean}; +use kiddo::{self, SquaredEuclidean}; + +use kiddo::immutable::float::kdtree::ImmutableKdTree; use rayon::prelude::*; use serde::{Deserialize, Serialize}; @@ -281,7 +283,9 @@ pub struct Engine { pub metadata: Option, #[serde(skip_serializing)] - tree: KdTree, + tree_index_to_geonameid: HashMap, + #[serde(skip_serializing)] + tree: ImmutableKdTree, #[cfg(feature = "geoip2_support")] #[serde(skip_serializing)] @@ -404,13 +408,13 @@ impl Engine { return None; } - let nearest_limit = if countries.is_some() { + let nearest_limit = std::num::NonZero::new(if countries.is_some() { // ugly hack try to fetch nearest cities in requested countries // much better is to build index for concrete countries self.geonames.len() } else { limit - }; + })?; let mut i1; let mut i2; @@ -428,7 +432,8 @@ impl Engine { .collect::>(); i1 = items.iter_mut().filter_map(move |nearest| { - let city = self.geonames.get(&nearest.item)?; + let geonameid = self.tree_index_to_geonameid.get(&(nearest.item as usize))?; + let city = self.geonames.get(geonameid)?; let country = city.country.as_ref()?; if countries.contains(&country.code) { Some((nearest, city)) @@ -439,7 +444,8 @@ impl Engine { &mut i1 } else { i2 = items.iter_mut().filter_map(|nearest| { - let city = self.geonames.get(&nearest.item)?; + let geonameid = self.tree_index_to_geonameid.get(&(nearest.item as usize))?; + let city = self.geonames.get(geonameid)?; Some((nearest, city)) }); &mut i2 @@ -843,7 +849,7 @@ impl Engine { None => None, }; - let mut tree = KdTree::new(); + let mut tree = Vec::with_capacity(records.len()); for record in records { // INCLUDE: @@ -880,7 +886,8 @@ impl Engine { continue; } - tree.add(&[record.latitude, record.longitude], record.geonameid); + // tree.add(&[record.latitude, record.longitude], record.geonameid); + tree.push((record.geonameid, [record.latitude, record.longitude])); let country_id = country_by_code .as_ref() @@ -991,8 +998,26 @@ impl Engine { ); } + tree.sort_unstable_by_key(|item| item.0); + + let tree_index_to_geonameid = HashMap::from_iter( + tree.clone() + .iter() + .enumerate() + .map(|(index, item)| (index, item.0)), + ); + + let tree = ImmutableKdTree::new_from_slice( + tree.clone() + .into_iter() + .map(|item| item.1) + .collect::>() + .as_slice(), + ); + let engine = Engine { geonames, + tree_index_to_geonameid, tree, entries, metadata: None, @@ -1112,16 +1137,36 @@ impl std::fmt::Display for GeoIP2Error { impl From for Engine { fn from(engine_dump: EngineDump) -> Engine { - let mut tree = KdTree::new(); - for (geonameid, record) in &engine_dump.geonames { - tree.add(&[record.latitude, record.longitude], *geonameid); - } + let mut items = engine_dump + .geonames + .clone() + .into_values() + .map(|record| (record.id, [record.latitude, record.longitude])) + .collect::>(); + + items.sort_unstable_by_key(|item| item.0); + + let tree_index_to_geonameid = HashMap::from_iter( + items + .clone() + .iter() + .enumerate() + .map(|(index, item)| (index, item.0)), + ); + let tree = ImmutableKdTree::new_from_slice( + items + .into_iter() + .map(|item| item.1) + .collect::>() + .as_slice(), + ); Engine { entries: engine_dump.entries, geonames: engine_dump.geonames, capitals: engine_dump.capitals, country_info_by_code: engine_dump.country_info_by_code, + tree_index_to_geonameid, tree, metadata: engine_dump.metadata, #[cfg(feature = "geoip2_support")] diff --git a/geosuggest-core/tests/lib.rs b/geosuggest-core/tests/lib.rs index 1b18d6a..fbefe6f 100644 --- a/geosuggest-core/tests/lib.rs +++ b/geosuggest-core/tests/lib.rs @@ -4,7 +4,6 @@ use geosuggest_core::{ }; use std::{env::temp_dir, error::Error}; -#[cfg(feature = "geoip2_support")] #[cfg(feature = "geoip2_support")] use std::{net::IpAddr, str::FromStr}; From 971cc42a0bbd21f53f18aded5299db2cda21301c Mon Sep 17 00:00:00 2001 From: Evgeniy Tatarkin Date: Mon, 9 Dec 2024 22:09:15 +0300 Subject: [PATCH 3/5] fix: less allocations on index build and read --- geosuggest-core/src/lib.rs | 99 +++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 50 deletions(-) diff --git a/geosuggest-core/src/lib.rs b/geosuggest-core/src/lib.rs index 6377df6..aef6a15 100644 --- a/geosuggest-core/src/lib.rs +++ b/geosuggest-core/src/lib.rs @@ -545,10 +545,6 @@ impl Engine { #[cfg(feature = "tracing")] let now = Instant::now(); - let mut entries: Vec = Vec::new(); - let mut geonames: HashMap = HashMap::new(); - let mut capitals: HashMap = HashMap::new(); - let records = split_content_to_n_parts(&cities, rayon::current_num_threads()) .par_iter() .map(|chunk| { @@ -569,6 +565,16 @@ impl Engine { m1 }); + let mut geonames: Vec = Vec::with_capacity(records.len()); + let mut entries: Vec = Vec::with_capacity( + records.len() + * if !filter_languages.is_empty() { + filter_languages.len() + } else { + 1 + }, + ); + #[cfg(feature = "tracing")] tracing::info!( "Engine read {} cities took {}ms", @@ -849,7 +855,12 @@ impl Engine { None => None, }; - let mut tree = Vec::with_capacity(records.len()); + let mut capitals: HashMap = + HashMap::with_capacity(if let Some(items) = &country_by_code { + items.len() + } else { + 0 + }); for record in records { // INCLUDE: @@ -881,14 +892,6 @@ impl Engine { let is_capital = feature_code == "PPLC"; - // prevent duplicates - if geonames.contains_key(&record.geonameid) { - continue; - } - - // tree.add(&[record.latitude, record.longitude], record.geonameid); - tree.push((record.geonameid, [record.latitude, record.longitude])); - let country_id = country_by_code .as_ref() .and_then(|m| m.get(&record.country_code).map(|c| c.geonameid)); @@ -967,56 +970,53 @@ impl Engine { } else { None }; - - geonames.insert( - record.geonameid, - CitiesRecord { - id: record.geonameid, - name: record.name, - country: country.as_ref().map(Country::from), - admin_division, - admin2_division, - latitude: record.latitude, - longitude: record.longitude, - timezone: record.timezone, - names: match names_by_id { - Some(ref mut names) => { - if is_capital { - names.get(&record.geonameid).cloned() - } else { - // don't hold unused data - names.remove(&record.geonameid) - } + geonames.push(CitiesRecord { + id: record.geonameid, + name: record.name, + country: country.as_ref().map(Country::from), + admin_division, + admin2_division, + latitude: record.latitude, + longitude: record.longitude, + timezone: record.timezone, + names: match names_by_id { + Some(ref mut names) => { + if is_capital { + names.get(&record.geonameid).cloned() + } else { + // don't hold unused data + names.remove(&record.geonameid) } - None => None, - }, - country_names, - admin1_names, - admin2_names, - population: record.population, + } + None => None, }, - ); + country_names, + admin1_names, + admin2_names, + population: record.population, + }); } - tree.sort_unstable_by_key(|item| item.0); + geonames.sort_unstable_by_key(|item| item.id); + geonames.dedup_by_key(|item| item.id); let tree_index_to_geonameid = HashMap::from_iter( - tree.clone() + geonames .iter() .enumerate() - .map(|(index, item)| (index, item.0)), + .map(|(index, item)| (index, item.id)), ); let tree = ImmutableKdTree::new_from_slice( - tree.clone() - .into_iter() - .map(|item| item.1) + geonames + .iter() + .map(|item| [item.latitude, item.longitude]) .collect::>() .as_slice(), ); let engine = Engine { - geonames, + geonames: HashMap::from_iter(geonames.into_iter().map(|item| (item.id, item))), tree_index_to_geonameid, tree, entries, @@ -1139,16 +1139,15 @@ impl From for Engine { fn from(engine_dump: EngineDump) -> Engine { let mut items = engine_dump .geonames - .clone() - .into_values() + .values() .map(|record| (record.id, [record.latitude, record.longitude])) .collect::>(); items.sort_unstable_by_key(|item| item.0); + items.dedup_by_key(|item| item.0); let tree_index_to_geonameid = HashMap::from_iter( items - .clone() .iter() .enumerate() .map(|(index, item)| (index, item.0)), From 616c0a88f85af872c255605159cc609b54e45364 Mon Sep 17 00:00:00 2001 From: Evgeniy Tatarkin Date: Tue, 10 Dec 2024 11:33:55 +0300 Subject: [PATCH 4/5] ci: try to use actions-rust-lang --- .github/workflows/rust.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 58550fa..275f7b8 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -10,8 +10,9 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: dtolnay/rust-toolchain@stable + - uses: actions-rust-lang/setup-rust-toolchain@v1 with: + toolchain: stable components: clippy - run: cargo clippy --workspace --all-features - run: cargo test --workspace --all-features From 3dacc8bbb76e465afeb52e43f05a4c8a4d64ef55 Mon Sep 17 00:00:00 2001 From: Evgeniy Tatarkin Date: Tue, 10 Dec 2024 11:43:44 +0300 Subject: [PATCH 5/5] ci: try don't use --all-features --- .github/workflows/rust.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 275f7b8..476e6e8 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -14,6 +14,6 @@ jobs: with: toolchain: stable components: clippy - - run: cargo clippy --workspace --all-features - - run: cargo test --workspace --all-features + - run: cargo clippy --workspace --no-default-features --features="tokio,geoip2_support,tracing" + - run: cargo test --workspace --no-default-features --features="tokio,geoip2_support,tracing" - run: cargo run -p geosuggest-examples --release --bin simple