Skip to content

Commit

Permalink
Merge pull request #288 from asfadmin/feature-find-urls-ext
Browse files Browse the repository at this point in the history
Feature: Find Urls by Extensions and Patterns
  • Loading branch information
SpicyGarlicAlbacoreRoll authored Jan 13, 2025
2 parents 147da5e + 86668c7 commit 1414e2e
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 22 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ and uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-->
------
## [v8.1.0](https://github.com/asfadmin/Discovery-asf_search/compare/v8.0.1...v8.1.0)
### Added
- Adds `ASFSearchResults.find_urls()` and `ASFProduct.find_urls()` to gather urls/uris from results by extension and/or regex pattern
### Changed
- Changed log level from warning to debug/info for search timing log messages
- Raised minimum Python version to 3.9 from 3.8, which reached EOL last year (see the official [Status of Python versions](https://devguide.python.org/versions/) for the Python version release cycle)
Expand Down
55 changes: 44 additions & 11 deletions asf_search/ASFProduct.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import warnings
from shapely.geometry import shape, Point, Polygon, mapping
import json
import re

from urllib import parse

Expand Down Expand Up @@ -264,7 +265,8 @@ def get_stack_opts(self, opts: ASFSearchOptions = None) -> ASFSearchOptions:
return None

def _get_access_urls(
self, url_types: List[str] = ['GET DATA', 'EXTENDED METADATA']
self,
url_types: List[str] = ['GET DATA', 'EXTENDED METADATA']
) -> List[str]:
accessUrls = []

Expand All @@ -274,23 +276,54 @@ def _get_access_urls(

return sorted(list(set(accessUrls)))

def _get_urls(self) -> List[str]:
"""Finds and returns all umm urls"""
urls = self._get_access_urls(
['GET DATA', 'EXTENDED METADATA', 'GET DATA VIA DIRECT ACCESS', 'GET RELATED VISUALIZATION', 'VIEW RELATED INFORMATION']
)
return [
url for url in urls if not url.startswith('s3://')
]

def _get_s3_uris(self) -> List[str]:
"""Finds and returns all umm S3 direct access uris"""
s3_urls = self._get_access_urls(
['GET DATA', 'EXTENDED METADATA', 'GET DATA VIA DIRECT ACCESS', 'GET RELATED VISUALIZATION', 'VIEW RELATED INFORMATION']
)
return [url for url in s3_urls if url.startswith('s3://')]

def _get_additional_urls(self) -> List[str]:
accessUrls = self._get_access_urls(['GET DATA', 'EXTENDED METADATA'])
"""Finds and returns all non-md5/image urls and filters out the existing `url` property"""
access_urls = self._get_urls()
return [
url
for url in accessUrls
url for url in access_urls
if not url.endswith('.md5')
and not url.startswith('s3://')
and 's3credentials' not in url
and not url.endswith('.png')
and url != self.properties['url']
and 's3credentials' not in url
]

def _get_s3_urls(self) -> List[str]:
s3_urls = self._get_access_urls(
['GET DATA', 'EXTENDED METADATA', 'GET DATA VIA DIRECT ACCESS']
)
return [url for url in s3_urls if url.startswith('s3://')]
def find_urls(self, extension: str = None, pattern: str = r'.*', directAccess: bool = False) -> List[str]:
    """
    Searches for all urls matching a given extension and/or pattern

    param extension: the file extension to search for, including the leading dot.
        It is compared exactly against the last suffix of the url's path
        (query string excluded), so '.tif' does not match 'file.tif.md5'.
        (Defaults to `None`, which disables extension filtering)
        - Example: '.tif'
    param pattern: A regex pattern to search each url for.
        (Defaults to `r'.*'`, which matches every url)
        - Example: `r'(QA_)+'` to find urls with 'QA_' at least once
    param directAccess: when `True` the s3 uris from `_get_s3_uris()` are searched
        instead of the urls from `_get_urls()` (Defaults to `False`)

    returns: the matching urls, sorted
    """
    search_list = self._get_s3_uris() if directAccess else self._get_urls()

    def _get_extension(file_url: str) -> str:
        # Parse first so a query string or fragment never ends up in the suffix
        path = parse.urlparse(file_url).path
        return os.path.splitext(path)[-1]

    if extension is not None:
        search_list = [url for url in search_list if _get_extension(url) == extension]

    # `search` (not `match`), so the pattern may hit anywhere in the url
    regexp = re.compile(pattern=pattern)

    return sorted(url for url in search_list if regexp.search(url) is not None)

def centroid(self) -> Point:
"""
Expand Down
16 changes: 16 additions & 0 deletions asf_search/ASFSearchResults.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import UserList
from multiprocessing import Pool
import json
from typing import List
from asf_search import ASFSession, ASFSearchOptions
from asf_search.download.file_download_type import FileDownloadType
from asf_search.exceptions import ASFSearchError
Expand Down Expand Up @@ -42,6 +43,21 @@ def jsonlite(self):
def jsonlite2(self):
    """Return these results as converted by `results_to_jsonlite2`.

    NOTE(review): the output format is defined by `results_to_jsonlite2`,
    which is not visible here -- presumably the "jsonlite2" export; confirm.
    """
    return results_to_jsonlite2(self)

def find_urls(self, extension: str = None, pattern: str = r'.*', directAccess: bool = False) -> List[str]:
    """Returns a flat list of all https or s3 urls from all results matching an extension and/or regex pattern

    Every product in the results is queried through its own `find_urls()`;
    the combined urls are de-duplicated and returned sorted.

    param extension: the file extension to search for. (Defaults to `None`, which disables extension filtering)
        - Example: '.tif'
    param pattern: A regex pattern to search each url for. (Defaults to `r'.*'`, which matches every url)
        - Example: `r'(QA_)+'` to find urls with 'QA_' at least once
    param directAccess: should search in s3 bucket urls (Defaults to `False`)
    """
    # A set drops urls that are reported by more than one product
    unique_urls = set()

    for product in self:
        unique_urls.update(
            product.find_urls(extension=extension, pattern=pattern, directAccess=directAccess)
        )

    return sorted(unique_urls)

def __str__(self):
return json.dumps(self.geojson(), indent=2, sort_keys=True)

Expand Down
2 changes: 1 addition & 1 deletion asf_search/Products/NISARProduct.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(self, args: Dict = {}, session: ASFSession = ASFSession()):
super().__init__(args, session)

self.properties['additionalUrls'] = self._get_additional_urls()
self.properties['s3Urls'] = self._get_s3_urls()
self.properties['s3Urls'] = self._get_s3_uris()

if self.properties.get('groupID') is None:
self.properties['groupID'] = self.properties['sceneName']
Expand Down
2 changes: 1 addition & 1 deletion asf_search/Products/S1Product.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class S1Product(ASFStackableProduct):
def __init__(self, args: Dict = {}, session: ASFSession = ASFSession()):
super().__init__(args, session)

self.properties['s3Urls'] = self._get_s3_urls()
self.properties['s3Urls'] = self._get_s3_uris()

if self.has_baseline():
self.baseline = self.get_baseline_calc_properties()
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
packages=find_packages(exclude=['tests.*', 'tests', 'examples.*', 'examples']),
package_dir={'asf_search': 'asf_search'},
include_package_data=True,
python_requires='>=3.8',
python_requires='>=3.9',
install_requires=requirements,
extras_require={'test': test_requirements, 'extras': extra_requirements},
license='BSD',
Expand All @@ -60,7 +60,6 @@
'Intended Audience :: Science/Research',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
Expand Down
68 changes: 61 additions & 7 deletions tests/pytest-managers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
from typing import Dict, List
from asf_search import (
ASFSearchOptions,
ASFSession,
FileDownloadType,
)
from asf_search import ASFSearchOptions, ASFSession, FileDownloadType, search

from asf_search.exceptions import ASFAuthenticationError

from ASFProduct.test_ASFProduct import (
Expand Down Expand Up @@ -481,8 +478,8 @@ def test_validator_map_validate(**args) -> None:
run_test_validator_map_validate(key, value, output)


def test_ASFSearchOptions_validator(**args) -> None:
test_info = args['test_info']
def test_ASFSearchOptions_validator(**kargs) -> None:
test_info = kargs['test_info']
validator_name = get_resource(test_info['validator'])
param = safe_load_tuple(get_resource(test_info['input']))
output = safe_load_tuple(get_resource(test_info['output']))
Expand All @@ -494,6 +491,63 @@ def test_ASFSearchOptions(**kwargs) -> None:
run_test_ASFSearchOptions(**kwargs)


def test_ASFSearchResults_get_urls() -> None:
    """Check `find_urls()` on the results of a search for one OPERA RTC-S1 granule.

    Calls `asf_search.search` directly (presumably a live request -- confirm),
    then compares the urls returned for several extension / pattern /
    directAccess combinations against hard-coded expected lists.
    """
    response = search(
        granule_list=[
            'OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0'
        ]
    )
    # No filters: every non-s3 url of the granule is expected, in sorted order
    actual_urls = response.find_urls()

    expected_urls = [
        'https://cumulus.asf.alaska.edu/s3credentials',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE.png',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE.png.md5',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE_low-res.png',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE_low-res.png.md5',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE_thumbnail.png',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE_thumbnail.png.md5',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0.h5',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0.h5.md5',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0.iso.xml',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0.iso.xml.md5',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_VH.tif',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_VH.tif.md5',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_VV.tif',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_VV.tif.md5',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_mask.tif',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_mask.tif.md5',
    ]
    assert actual_urls == expected_urls
    # Extension only: the '.tif.md5' checksum urls are not expected
    assert response.find_urls('.tif') == [
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_VH.tif',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_VV.tif',
        'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_mask.tif',
    ]
    # Pattern only
    assert response.find_urls(pattern='.*s3credentials') == [
        'https://cumulus.asf.alaska.edu/s3credentials'
    ]
    # Extension + directAccess: s3 uris are expected instead of https urls
    assert response.find_urls('.tif', directAccess=True) == [
        's3://asf-cumulus-prod-opera-products/OPERA_L2_RTC-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_VH.tif',
        's3://asf-cumulus-prod-opera-products/OPERA_L2_RTC-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_VV.tif',
        's3://asf-cumulus-prod-opera-products/OPERA_L2_RTC-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_mask.tif',
    ]

    # Pattern without extension: both the images and their '.md5' companions are expected
    assert response.find_urls(pattern=r'.*BROWSE.*') == [
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE.png',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE.png.md5',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE_low-res.png',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE_low-res.png.md5',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE_thumbnail.png',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE_thumbnail.png.md5',
    ]
    # Extension + pattern combined: only the '.png' images are expected
    assert response.find_urls('.png', pattern=r'.*BROWSE.*') == [
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE.png',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE_low-res.png',
        'https://datapool.asf.alaska.edu/BROWSE/OPERA-S1/OPERA_L2_RTC-S1_T131-279916-IW1_20231202T162856Z_20231202T232622Z_S1A_30_v1.0_BROWSE_thumbnail.png',
    ]


def test_ASFSearchResults_intersection(**kwargs) -> None:
wkt = get_resource(kwargs['test_info']['wkt'])
run_test_ASFSearchResults_intersection(wkt)
Expand Down

0 comments on commit 1414e2e

Please sign in to comment.