From 7bbec6b2e454912f2835140e41438fa9d4ca8d6e Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Wed, 1 Jan 2025 12:35:30 -0500 Subject: [PATCH] Parse URLs lazily in resolver (#10259) ## Summary The idea here is to avoid parsing all registry URLs upfront, and instead parse them when we need them. Closes #6133. --- crates/uv-client/src/error.rs | 2 +- crates/uv-client/src/registry_client.rs | 2 +- crates/uv-distribution-types/src/file.rs | 80 ++++++------------- .../src/distribution_database.rs | 2 +- crates/uv-distribution/src/error.rs | 4 +- crates/uv-distribution/src/source/mod.rs | 4 +- crates/uv-requirements/src/upgrade.rs | 2 +- crates/uv-resolver/src/lock/mod.rs | 47 ++++++----- .../uv-resolver/src/lock/requirements_txt.rs | 16 +++- 9 files changed, 70 insertions(+), 89 deletions(-) diff --git a/crates/uv-client/src/error.rs b/crates/uv-client/src/error.rs index 30df2fe0d769..b9b5a81c761f 100644 --- a/crates/uv-client/src/error.rs +++ b/crates/uv-client/src/error.rs @@ -139,7 +139,7 @@ impl From for Error { #[derive(Debug, thiserror::Error)] pub enum ErrorKind { #[error(transparent)] - UrlParse(#[from] url::ParseError), + InvalidUrl(#[from] uv_distribution_types::ToUrlError), #[error(transparent)] JoinRelativeUrl(#[from] uv_pypi_types::JoinRelativeError), diff --git a/crates/uv-client/src/registry_client.rs b/crates/uv-client/src/registry_client.rs index 0510b42c0386..c792a9882a84 100644 --- a/crates/uv-client/src/registry_client.rs +++ b/crates/uv-client/src/registry_client.rs @@ -474,7 +474,7 @@ impl RegistryClient { } } FileLocation::AbsoluteUrl(url) => { - let url = url.to_url(); + let url = url.to_url().map_err(ErrorKind::InvalidUrl)?; if url.scheme() == "file" { let path = url .to_file_path() diff --git a/crates/uv-distribution-types/src/file.rs b/crates/uv-distribution-types/src/file.rs index 10de4a9c6345..081cab1107d6 100644 --- a/crates/uv-distribution-types/src/file.rs +++ b/crates/uv-distribution-types/src/file.rs @@ -1,6 +1,4 @@ -use std::borrow::Cow; use std::fmt::{self, Display, Formatter}; -use std::path::PathBuf; use std::str::FromStr; use jiff::Timestamp; @@ -8,7 +6,7 @@ use serde::{Deserialize, Serialize}; use url::Url; use uv_pep440::{VersionSpecifiers, VersionSpecifiersParseError}; -use uv_pep508::VerbatimUrl; +use uv_pep508::split_scheme; use uv_pypi_types::{CoreMetadata, HashDigest, Yanked}; /// Error converting [`uv_pypi_types::File`] to [`distribution_type::File`]. @@ -56,9 +54,9 @@ impl File { .map_err(|err| FileConversionError::RequiresPython(err.line().clone(), err))?, size: file.size, upload_time_utc_ms: file.upload_time.map(Timestamp::as_millisecond), - url: match Url::parse(&file.url) { - Ok(url) => FileLocation::AbsoluteUrl(url.into()), - Err(_) => FileLocation::RelativeUrl(base.to_string(), file.url), + url: match split_scheme(&file.url) { + Some(..) => FileLocation::AbsoluteUrl(UrlString::new(file.url)), + None => FileLocation::RelativeUrl(base.to_string(), file.url), }, yanked: file.yanked, }) @@ -101,7 +99,7 @@ impl FileLocation { })?; Ok(joined) } - FileLocation::AbsoluteUrl(ref absolute) => Ok(absolute.to_url()), + FileLocation::AbsoluteUrl(ref absolute) => absolute.to_url(), } } @@ -128,7 +126,7 @@ impl Display for FileLocation { /// A [`Url`] represented as a `String`. /// -/// This type is guaranteed to be a valid URL but avoids being parsed into the [`Url`] type. +/// This type is not guaranteed to be a valid URL, and may error on conversion. #[derive( Debug, Clone, @@ -148,10 +146,17 @@ impl Display for FileLocation { pub struct UrlString(String); impl UrlString { + /// Create a new [`UrlString`] from a [`String`]. + pub fn new(url: String) -> Self { + Self(url) + } + /// Converts a [`UrlString`] to a [`Url`]. - pub fn to_url(&self) -> Url { - // This conversion can never fail as the only way to construct a `UrlString` is from a `Url`. - Url::from_str(&self.0).unwrap() + pub fn to_url(&self) -> Result { + Url::from_str(&self.0).map_err(|err| ToUrlError::InvalidAbsolute { + absolute: self.0.clone(), + err, + }) } /// Return the [`UrlString`] with any query parameters and fragments removed. @@ -194,42 +199,18 @@ impl From<&Url> for UrlString { } } -impl From> for UrlString { - fn from(value: Cow<'_, Url>) -> Self { - UrlString(value.to_string()) - } -} - -impl From for UrlString { - fn from(value: VerbatimUrl) -> Self { - UrlString(value.raw().to_string()) - } -} - -impl From<&VerbatimUrl> for UrlString { - fn from(value: &VerbatimUrl) -> Self { - UrlString(value.raw().to_string()) - } -} - -impl From for String { - fn from(value: UrlString) -> Self { - value.0 - } -} - impl Display for UrlString { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { fmt::Display::fmt(&self.0, f) } } -/// An error that occurs when a `FileLocation` is not a valid URL. +/// An error that occurs when a [`FileLocation`] is not a valid URL. #[derive(Clone, Debug, Eq, PartialEq, thiserror::Error)] pub enum ToUrlError { - /// An error that occurs when the base URL in `FileLocation::Relative` + /// An error that occurs when the base URL in [`FileLocation::Relative`] /// could not be parsed as a valid URL. - #[error("could not parse base URL `{base}` as a valid URL")] + #[error("Could not parse base URL `{base}` as a valid URL")] InvalidBase { /// The base URL that could not be parsed as a valid URL. base: String, @@ -238,8 +219,8 @@ pub enum ToUrlError { err: url::ParseError, }, /// An error that occurs when the base URL could not be joined with - /// the relative path in a `FileLocation::Relative`. - #[error("could not join base URL `{base}` to relative path `{path}`")] + /// the relative path in a [`FileLocation::Relative`]. + #[error("Could not join base URL `{base}` to relative path `{path}`")] InvalidJoin { /// The base URL that could not be parsed as a valid URL. base: String, @@ -249,9 +230,9 @@ pub enum ToUrlError { #[source] err: url::ParseError, }, - /// An error that occurs when the absolute URL in `FileLocation::Absolute` + /// An error that occurs when the absolute URL in [`FileLocation::Absolute`] /// could not be parsed as a valid URL. - #[error("could not parse absolute URL `{absolute}` as a valid URL")] + #[error("Could not parse absolute URL `{absolute}` as a valid URL")] InvalidAbsolute { /// The absolute URL that could not be parsed as a valid URL. absolute: String, @@ -259,21 +240,6 @@ pub enum ToUrlError { #[source] err: url::ParseError, }, - /// An error that occurs when the file path in `FileLocation::Path` is - /// not valid UTF-8. We need paths to be valid UTF-8 to be transformed - /// into URLs, which must also be UTF-8. - #[error("could not build URL from file path `{path}` because it is not valid UTF-8")] - PathNotUtf8 { - /// The original path that was not valid UTF-8. - path: PathBuf, - }, - /// An error that occurs when the file URL created from a file path is not - /// a valid URL. - #[error("could not parse file path `{path}` as a valid URL")] - InvalidPath { - /// The file path URL that could not be parsed as a valid URL. - path: String, - }, } #[cfg(test)] diff --git a/crates/uv-distribution/src/distribution_database.rs b/crates/uv-distribution/src/distribution_database.rs index 0904269199b6..3b0ab11a4cc2 100644 --- a/crates/uv-distribution/src/distribution_database.rs +++ b/crates/uv-distribution/src/distribution_database.rs @@ -187,7 +187,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> { FileLocation::RelativeUrl(base, url) => { uv_pypi_types::base_url_join_relative(base, url)? } - FileLocation::AbsoluteUrl(url) => url.to_url(), + FileLocation::AbsoluteUrl(url) => url.to_url()?, }; // Create a cache entry for the wheel. diff --git a/crates/uv-distribution/src/error.rs b/crates/uv-distribution/src/error.rs index 84315658e42e..0727b8896927 100644 --- a/crates/uv-distribution/src/error.rs +++ b/crates/uv-distribution/src/error.rs @@ -21,11 +21,11 @@ pub enum Error { NoBuild, // Network error - #[error("Failed to parse URL: {0}")] - Url(String, #[source] url::ParseError), #[error("Expected an absolute path, but received: {}", _0.user_display())] RelativePath(PathBuf), #[error(transparent)] + InvalidUrl(#[from] uv_distribution_types::ToUrlError), + #[error(transparent)] ParsedUrl(#[from] ParsedUrlError), #[error(transparent)] JoinRelativeUrl(#[from] uv_pypi_types::JoinRelativeError), diff --git a/crates/uv-distribution/src/source/mod.rs b/crates/uv-distribution/src/source/mod.rs index 62164aabb5d8..772c3d384442 100644 --- a/crates/uv-distribution/src/source/mod.rs +++ b/crates/uv-distribution/src/source/mod.rs @@ -114,7 +114,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> { FileLocation::RelativeUrl(base, url) => { uv_pypi_types::base_url_join_relative(base, url)? } - FileLocation::AbsoluteUrl(url) => url.to_url(), + FileLocation::AbsoluteUrl(url) => url.to_url()?, }; // If the URL is a file URL, use the local path directly. @@ -263,7 +263,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> { FileLocation::RelativeUrl(base, url) => { uv_pypi_types::base_url_join_relative(base, url)? } - FileLocation::AbsoluteUrl(url) => url.to_url(), + FileLocation::AbsoluteUrl(url) => url.to_url()?, }; // If the URL is a file URL, use the local path directly. diff --git a/crates/uv-requirements/src/upgrade.rs b/crates/uv-requirements/src/upgrade.rs index 8a472b373b94..7f7556e67d74 100644 --- a/crates/uv-requirements/src/upgrade.rs +++ b/crates/uv-requirements/src/upgrade.rs @@ -81,7 +81,7 @@ pub fn read_lock_requirements( preferences.push(Preference::from_lock(package, install_path)?); // Map each entry in the lockfile to a Git SHA. - if let Some(git_ref) = package.as_git_ref() { + if let Some(git_ref) = package.as_git_ref()? { git.push(git_ref); } } diff --git a/crates/uv-resolver/src/lock/mod.rs b/crates/uv-resolver/src/lock/mod.rs index 7d85bcd6cd7b..49f95bb7909e 100644 --- a/crates/uv-resolver/src/lock/mod.rs +++ b/crates/uv-resolver/src/lock/mod.rs @@ -1098,7 +1098,7 @@ impl Lock { .into_iter() .filter_map(|index| match index.url() { IndexUrl::Pypi(_) | IndexUrl::Url(_) => { - Some(UrlString::from(index.url().redacted())) + Some(UrlString::from(index.url().redacted().as_ref())) } IndexUrl::Path(_) => None, }) @@ -1814,7 +1814,7 @@ impl Package { let filename: WheelFilename = self.wheels[best_wheel_index].filename.clone(); let url = Url::from(ParsedArchiveUrl { - url: url.to_url(), + url: url.to_url().map_err(LockErrorKind::InvalidUrl)?, subdirectory: direct.subdirectory.clone(), ext: DistExtension::Wheel, }); @@ -1946,7 +1946,7 @@ impl Package { Source::Git(url, git) => { // Remove the fragment and query from the URL; they're already present in the // `GitSource`. - let mut url = url.to_url(); + let mut url = url.to_url().map_err(LockErrorKind::InvalidUrl)?; url.set_fragment(None); url.set_query(None); @@ -1976,7 +1976,7 @@ impl Package { let DistExtension::Source(ext) = DistExtension::from_path(url.as_ref())? else { return Ok(None); }; - let location = url.to_url(); + let location = url.to_url().map_err(LockErrorKind::InvalidUrl)?; let subdirectory = direct.subdirectory.as_ref().map(PathBuf::from); let url = Url::from(ParsedArchiveUrl { url: location.clone(), @@ -2020,7 +2020,10 @@ impl Package { url: FileLocation::AbsoluteUrl(file_url.clone()), yanked: None, }); - let index = IndexUrl::from(VerbatimUrl::from_url(url.to_url())); + + let index = IndexUrl::from(VerbatimUrl::from_url( + url.to_url().map_err(LockErrorKind::InvalidUrl)?, + )); let reg_dist = RegistrySourceDist { name: self.id.name.clone(), @@ -2262,7 +2265,9 @@ impl Package { pub fn index(&self, root: &Path) -> Result, LockError> { match &self.id.source { Source::Registry(RegistrySource::Url(url)) => { - let index = IndexUrl::from(VerbatimUrl::from_url(url.to_url())); + let index = IndexUrl::from(VerbatimUrl::from_url( + url.to_url().map_err(LockErrorKind::InvalidUrl)?, + )); Ok(Some(index)) } Source::Registry(RegistrySource::Path(path)) => { @@ -2291,16 +2296,16 @@ impl Package { } /// Returns the [`ResolvedRepositoryReference`] for the package, if it is a Git source. - pub fn as_git_ref(&self) -> Option { + pub fn as_git_ref(&self) -> Result, LockError> { match &self.id.source { - Source::Git(url, git) => Some(ResolvedRepositoryReference { + Source::Git(url, git) => Ok(Some(ResolvedRepositoryReference { reference: RepositoryReference { - url: RepositoryUrl::new(&url.to_url()), + url: RepositoryUrl::new(&url.to_url().map_err(LockErrorKind::InvalidUrl)?), reference: GitReference::from(git.kind.clone()), }, sha: git.precise, - }), - _ => None, + })), + _ => Ok(None), } } } @@ -3138,7 +3143,7 @@ impl SourceDist { match ®_dist.index { IndexUrl::Pypi(_) | IndexUrl::Url(_) => { let url = normalize_file_location(®_dist.file.url) - .map_err(LockErrorKind::InvalidFileUrl) + .map_err(LockErrorKind::InvalidUrl) .map_err(LockError::from)?; let hash = reg_dist.file.hashes.iter().max().cloned().map(Hash::from); let size = reg_dist.file.size; @@ -3153,7 +3158,7 @@ impl SourceDist { .file .url .to_url() - .map_err(LockErrorKind::InvalidFileUrl)? + .map_err(LockErrorKind::InvalidUrl)? .to_file_path() .map_err(|()| LockErrorKind::UrlToPath)?; let path = relative_to(®_dist_path, index_path) @@ -3444,7 +3449,7 @@ impl Wheel { match &wheel.index { IndexUrl::Pypi(_) | IndexUrl::Url(_) => { let url = normalize_file_location(&wheel.file.url) - .map_err(LockErrorKind::InvalidFileUrl) + .map_err(LockErrorKind::InvalidUrl) .map_err(LockError::from)?; let hash = wheel.file.hashes.iter().max().cloned().map(Hash::from); let size = wheel.file.size; @@ -3461,7 +3466,7 @@ impl Wheel { .file .url .to_url() - .map_err(LockErrorKind::InvalidFileUrl)? + .map_err(LockErrorKind::InvalidUrl)? .to_file_path() .map_err(|()| LockErrorKind::UrlToPath)?; let path = relative_to(&wheel_path, index_path) @@ -3507,7 +3512,7 @@ impl Wheel { let filename: WheelFilename = self.filename.clone(); match source { - RegistrySource::Url(index_url) => { + RegistrySource::Url(url) => { let file_url = match &self.url { WheelWireSource::Url { url } => url, WheelWireSource::Path { .. } | WheelWireSource::Filename { .. } => { @@ -3528,7 +3533,9 @@ impl Wheel { url: FileLocation::AbsoluteUrl(file_url.clone()), yanked: None, }); - let index = IndexUrl::from(VerbatimUrl::from_url(index_url.to_url())); + let index = IndexUrl::from(VerbatimUrl::from_url( + url.to_url().map_err(LockErrorKind::InvalidUrl)?, + )); Ok(RegistryBuiltWheel { filename, file, @@ -4094,11 +4101,11 @@ enum LockErrorKind { }, /// An error that occurs when the URL to a file for a wheel or /// source dist could not be converted to a structured `url::Url`. - #[error("Failed to parse wheel or source distribution URL")] - InvalidFileUrl( + #[error(transparent)] + InvalidUrl( /// The underlying error that occurred. This includes the /// errant URL in its error message. - #[source] + #[from] ToUrlError, ), /// An error that occurs when the extension can't be determined diff --git a/crates/uv-resolver/src/lock/requirements_txt.rs b/crates/uv-resolver/src/lock/requirements_txt.rs index 7290f61ba20b..694064727e34 100644 --- a/crates/uv-resolver/src/lock/requirements_txt.rs +++ b/crates/uv-resolver/src/lock/requirements_txt.rs @@ -303,7 +303,7 @@ impl std::fmt::Display for RequirementsTxtExport<'_> { Source::Git(url, git) => { // Remove the fragment and query from the URL; they're already present in the // `GitSource`. - let mut url = url.to_url(); + let mut url = url.to_url().map_err(|_| std::fmt::Error)?; url.set_fragment(None); url.set_query(None); @@ -325,7 +325,7 @@ impl std::fmt::Display for RequirementsTxtExport<'_> { Source::Direct(url, direct) => { let subdirectory = direct.subdirectory.as_ref().map(PathBuf::from); let url = Url::from(ParsedArchiveUrl { - url: url.to_url(), + url: url.to_url().map_err(|_| std::fmt::Error)?, subdirectory: subdirectory.clone(), ext: DistExtension::Source(SourceDistExtension::TarGz), }); @@ -333,7 +333,11 @@ impl std::fmt::Display for RequirementsTxtExport<'_> { } Source::Path(path) | Source::Directory(path) => { if path.is_absolute() { - write!(f, "{}", Url::from_file_path(path).unwrap())?; + write!( + f, + "{}", + Url::from_file_path(path).map_err(|()| std::fmt::Error)? + )?; } else { write!(f, "{}", anchor(path).portable_display())?; } @@ -344,7 +348,11 @@ impl std::fmt::Display for RequirementsTxtExport<'_> { } EditableMode::NonEditable => { if path.is_absolute() { - write!(f, "{}", Url::from_file_path(path).unwrap())?; + write!( + f, + "{}", + Url::from_file_path(path).map_err(|()| std::fmt::Error)? + )?; } else { write!(f, "{}", anchor(path).portable_display())?; }