Skip to content

Commit

Permalink
Add support for exporting to duckdb (via parquet) (#157)
Browse files Browse the repository at this point in the history
* Add support for exporting to duckdb (via parquet)

This patch adds functionality to export directly to a DuckDB database,
either via the --duckdb flag or by giving the output file a ".duckdb" or
".db" extension. Optionally, one can change the name of the table into
which the data will be imported.

Documentation was mostly copied from existing functions but doctests
were updated and checked for consistency with the results.

Closes #94

* Fix open mypy issues

* Fix doctest errors; fix filter_osm_ids typing properly

This patch fixes the remaining doctest errors that occurred during
#157. Meanwhile, a remaining bug was discovered around
the typing of filter_osm_ids. It was solved, too.

* Resolve refurb: immut. tuple over list for suffix

* Fix typo in test_cli.py

* Fix test_cli.py for duckdb: correct file, split args

* chore: add new test case for increased coverage

* chore: add author to the changelog

* feat: add automatic directory generation for duckdb export

* chore: refactor cli codebase

* fix: remove commas

* chore: change working directory test

---------

Co-authored-by: Kamil Raczycki <raczyckikamil@gmail.com>
  • Loading branch information
mwip and RaczeQ authored Oct 9, 2024
1 parent 201a199 commit 61dd66f
Show file tree
Hide file tree
Showing 7 changed files with 903 additions and 15 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- Option to export to DuckDB database [#94](https://github.com/kraina-ai/quackosm/issues/94) (implemented by [@mwip](https://github.com/mwip))

## [0.11.0] - 2024-09-24

### Changed
Expand Down
6 changes: 6 additions & 0 deletions quackosm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@
"""

from quackosm.functions import (
convert_geometry_to_duckdb,
convert_geometry_to_geodataframe,
convert_geometry_to_parquet,
convert_osm_extract_to_duckdb,
convert_osm_extract_to_geodataframe,
convert_osm_extract_to_parquet,
convert_pbf_to_duckdb,
convert_pbf_to_geodataframe,
convert_pbf_to_parquet,
)
Expand All @@ -23,8 +26,11 @@
__all__ = [
"PbfFileReader",
"convert_pbf_to_parquet",
"convert_pbf_to_duckdb",
"convert_geometry_to_parquet",
"convert_geometry_to_duckdb",
"convert_osm_extract_to_parquet",
"convert_osm_extract_to_duckdb",
"convert_pbf_to_geodataframe",
"convert_geometry_to_geodataframe",
"convert_osm_extract_to_geodataframe",
Expand Down
129 changes: 119 additions & 10 deletions quackosm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,11 +500,31 @@ def main(
"--output",
"-o",
help=(
"Path where to save final geoparquet file. If not provided, it will be generated"
"Path where to save final result file. If not provided, it will be generated"
" automatically based on the input pbf file name."
" Can be [bold green].parquet[/bold green] or"
" [bold green].db[/bold green] or [bold green].duckdb[/bold green] extension."
),
),
] = None,
duckdb: Annotated[
bool,
typer.Option(
"--duckdb",
help=(
"Export to duckdb database. If not provided, data can still be exported if"
" [bold bright_cyan]output[/bold bright_cyan] has [bold green].db[/bold green]"
" or [bold green].duckdb[/bold green] extension."
),
),
] = False,
duckdb_table_name: Annotated[
Optional[str],
typer.Option(
"--duckdb-table-name",
help="Table name which the data will be imported into in the DuckDB database.",
),
] = "quackosm",
ignore_cache: Annotated[
bool,
typer.Option(
Expand Down Expand Up @@ -687,11 +707,21 @@ def main(
verbosity_mode = "silent"

logging.disable(logging.CRITICAL)
if pbf_file:

is_duckdb = (result_file_path and result_file_path.suffix in (".duckdb", ".db")) or duckdb

pbf_file_parquet = pbf_file and not is_duckdb
pbf_file_duckdb = pbf_file and is_duckdb
osm_extract_parquet = osm_extract_query and not is_duckdb
osm_extract_duckdb = osm_extract_query and is_duckdb
geometry_parquet = not pbf_file and not osm_extract_query and not is_duckdb
geometry_duckdb = not pbf_file and not osm_extract_query and is_duckdb

if pbf_file_parquet:
from quackosm.functions import convert_pbf_to_parquet

geoparquet_path = convert_pbf_to_parquet(
pbf_path=pbf_file,
result_path = convert_pbf_to_parquet(
pbf_path=cast(str, pbf_file),
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
keep_all_tags=keep_all_tags,
geometry_filter=geometry_filter_value,
Expand All @@ -708,13 +738,34 @@ def main(
save_as_wkt=wkt_result,
verbosity_mode=verbosity_mode,
)
elif osm_extract_query:
elif pbf_file_duckdb:
from quackosm.functions import convert_pbf_to_duckdb

result_path = convert_pbf_to_duckdb(
pbf_path=cast(str, pbf_file),
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
keep_all_tags=keep_all_tags,
geometry_filter=geometry_filter_value,
explode_tags=explode_tags,
ignore_cache=ignore_cache,
working_directory=working_directory,
result_file_path=result_file_path,
osm_way_polygon_features_config=(
json.loads(Path(osm_way_polygon_features_config).read_text())
if osm_way_polygon_features_config
else None
),
filter_osm_ids=filter_osm_ids, # type: ignore
duckdb_table_name=duckdb_table_name or "quackosm",
verbosity_mode=verbosity_mode,
)
elif osm_extract_parquet:
from quackosm._exceptions import OsmExtractSearchError
from quackosm.functions import convert_osm_extract_to_parquet

try:
geoparquet_path = convert_osm_extract_to_parquet(
osm_extract_query=osm_extract_query,
result_path = convert_osm_extract_to_parquet(
osm_extract_query=cast(str, osm_extract_query),
osm_extract_source=osm_extract_source,
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
keep_all_tags=keep_all_tags,
Expand All @@ -738,10 +789,64 @@ def main(
err_console = Console(stderr=True)
err_console.print(ex)
raise typer.Exit(code=1) from None
else:
elif osm_extract_duckdb:
from quackosm._exceptions import OsmExtractSearchError
from quackosm.functions import convert_osm_extract_to_duckdb

try:
result_path = convert_osm_extract_to_duckdb(
osm_extract_query=cast(str, osm_extract_query),
osm_extract_source=osm_extract_source,
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
keep_all_tags=keep_all_tags,
geometry_filter=geometry_filter_value,
explode_tags=explode_tags,
ignore_cache=ignore_cache,
working_directory=working_directory,
result_file_path=result_file_path,
osm_way_polygon_features_config=(
json.loads(Path(osm_way_polygon_features_config).read_text())
if osm_way_polygon_features_config
else None
),
filter_osm_ids=filter_osm_ids, # type: ignore
duckdb_table_name=duckdb_table_name or "quackosm",
save_as_wkt=wkt_result,
verbosity_mode=verbosity_mode,
)
except OsmExtractSearchError as ex:
from rich.console import Console

err_console = Console(stderr=True)
err_console.print(ex)
raise typer.Exit(code=1) from None
elif geometry_parquet:
from quackosm.functions import convert_geometry_to_parquet

geoparquet_path = convert_geometry_to_parquet(
result_path = convert_geometry_to_parquet(
geometry_filter=geometry_filter_value,
osm_extract_source=osm_extract_source,
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
keep_all_tags=keep_all_tags,
explode_tags=explode_tags,
ignore_cache=ignore_cache,
working_directory=working_directory,
result_file_path=result_file_path,
osm_way_polygon_features_config=(
json.loads(Path(osm_way_polygon_features_config).read_text())
if osm_way_polygon_features_config
else None
),
filter_osm_ids=filter_osm_ids, # type: ignore
save_as_wkt=wkt_result,
verbosity_mode=verbosity_mode,
geometry_coverage_iou_threshold=geometry_coverage_iou_threshold,
allow_uncovered_geometry=allow_uncovered_geometry,
)
elif geometry_duckdb:
from quackosm.functions import convert_geometry_to_duckdb

result_path = convert_geometry_to_duckdb(
geometry_filter=geometry_filter_value,
osm_extract_source=osm_extract_source,
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
Expand All @@ -756,9 +861,13 @@ def main(
else None
),
filter_osm_ids=filter_osm_ids, # type: ignore
duckdb_table_name=duckdb_table_name or "quackosm",
save_as_wkt=wkt_result,
verbosity_mode=verbosity_mode,
geometry_coverage_iou_threshold=geometry_coverage_iou_threshold,
allow_uncovered_geometry=allow_uncovered_geometry,
)
typer.secho(geoparquet_path, fg="green")
else:
raise RuntimeError("Unknown operation mode")

typer.secho(result_path, fg="green")
Loading

0 comments on commit 61dd66f

Please sign in to comment.