From f2d40c9be6f5e62ed85af483716dae2001ef0b66 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 16 Feb 2023 21:09:54 -0800 Subject: [PATCH] Rewrite Recap This commit is pretty much a complete rewrite. I was really unhappy with how complicated things had gotten with Recap, especially the Python API. I've rewritten things to add: * A very simple REPL. * A FastAPI-like metadata crawling API. * A basic data catalog. * A basic crawler. * A storage layer with a graph-like API. These changes make it much easier to work with Recap in Python. They also lay the groundwork for complex schema conversion features that I want to write. There's way too much to document in this commit message, so see the updated docs for more information. --- README.md | 107 ++++-- docs/api/recap.analyzers.md | 1 - docs/api/recap.browsers.md | 1 - docs/api/recap.catalog.md | 1 + docs/api/recap.catalogs.md | 1 - docs/api/recap.crawler.md | 1 - docs/api/recap.integrations.md | 1 + docs/api/recap.metadata.md | 1 + docs/api/recap.paths.md | 1 - docs/api/recap.repl.md | 1 + docs/api/recap.storage.md | 1 + docs/cli.md | 2 +- docs/guides/configuration.md | 39 +- docs/guides/logging.md | 4 +- docs/guides/plugins.md | 103 ------ docs/index.md | 101 ++++- docs/quickstart.md | 202 ++++------ docs/rest.md | 57 +-- mkdocs.yml | 11 +- pdm.lock | 225 ++++++------ pyproject.toml | 41 +-- recap/analyzers/__init__.py | 17 - recap/analyzers/abstract.py | 23 -- recap/analyzers/frictionless/__init__.py | 0 recap/analyzers/frictionless/columns.py | 79 ---- recap/analyzers/sqlalchemy/__init__.py | 0 recap/analyzers/sqlalchemy/columns.py | 57 --- recap/browsers/__init__.py | 30 -- recap/browsers/abstract.py | 41 --- recap/browsers/analyzing.py | 134 ------- recap/browsers/db.py | 178 --------- recap/browsers/fs.py | 68 ---- recap/catalog.py | 413 +++++++++++++++++++++ recap/catalogs/__init__.py | 26 -- recap/catalogs/abstract.py | 95 ----- recap/catalogs/db.py | 446 ----------------------- recap/catalogs/recap.py | 136 ------- recap/cli.py | 221 ++++++++++- recap/commands/__init__.py | 0 recap/commands/catalog.py | 102 ------ recap/commands/crawl.py | 92 ----- recap/commands/plugins.py | 51 --- recap/commands/serve.py | 39 -- recap/config.py | 36 +- recap/crawler.py | 263 ++++--------- recap/integrations/__init__.py | 4 + recap/integrations/bigquery.py | 299 +++++++++++++++ recap/integrations/fsspec.py | 93 +++++ recap/integrations/sqlalchemy.py | 105 ++++++ recap/logging.py | 11 +- recap/metadata.py | 69 ++-- recap/plugins.py | 126 ------- recap/registry.py | 147 ++++++++ recap/repl.py | 167 +++++++++ recap/routers/__init__.py | 0 recap/routers/catalog.py | 77 ---- recap/schemas/__init__.py | 0 recap/schemas/schema.py | 23 -- recap/server.py | 92 ++++- recap/storage/__init__.py | 16 + recap/storage/abstract.py | 136 +++++++ recap/storage/db.py | 248 +++++++++++++ recap/storage/remote.py | 120 ++++++ recap/url.py | 80 ---- tests/catalogs/test_db.py | 153 -------- tests/storage/test_db.py | 81 ++++ 66 files changed, 2672 insertions(+), 2824 deletions(-) delete mode 100644 docs/api/recap.analyzers.md delete mode 100644 docs/api/recap.browsers.md create mode 100644 docs/api/recap.catalog.md delete mode 100644 docs/api/recap.catalogs.md delete mode 100644 docs/api/recap.crawler.md create mode 100644 docs/api/recap.integrations.md create mode 100644 docs/api/recap.metadata.md delete mode 100644 docs/api/recap.paths.md create mode 100644 docs/api/recap.repl.md create mode 100644 docs/api/recap.storage.md delete mode 100644 docs/guides/plugins.md
delete mode 100644 recap/analyzers/__init__.py delete mode 100644 recap/analyzers/abstract.py delete mode 100644 recap/analyzers/frictionless/__init__.py delete mode 100644 recap/analyzers/frictionless/columns.py delete mode 100644 recap/analyzers/sqlalchemy/__init__.py delete mode 100644 recap/analyzers/sqlalchemy/columns.py delete mode 100644 recap/browsers/__init__.py delete mode 100644 recap/browsers/abstract.py delete mode 100644 recap/browsers/analyzing.py delete mode 100644 recap/browsers/db.py delete mode 100644 recap/browsers/fs.py create mode 100644 recap/catalog.py delete mode 100644 recap/catalogs/__init__.py delete mode 100644 recap/catalogs/abstract.py delete mode 100644 recap/catalogs/db.py delete mode 100644 recap/catalogs/recap.py delete mode 100644 recap/commands/__init__.py delete mode 100644 recap/commands/catalog.py delete mode 100644 recap/commands/crawl.py delete mode 100644 recap/commands/plugins.py delete mode 100644 recap/commands/serve.py create mode 100644 recap/integrations/__init__.py create mode 100644 recap/integrations/bigquery.py create mode 100644 recap/integrations/fsspec.py create mode 100644 recap/integrations/sqlalchemy.py delete mode 100644 recap/plugins.py create mode 100644 recap/registry.py create mode 100644 recap/repl.py delete mode 100644 recap/routers/__init__.py delete mode 100644 recap/routers/catalog.py delete mode 100644 recap/schemas/__init__.py delete mode 100644 recap/schemas/schema.py create mode 100644 recap/storage/__init__.py create mode 100644 recap/storage/abstract.py create mode 100644 recap/storage/db.py create mode 100644 recap/storage/remote.py delete mode 100644 recap/url.py delete mode 100644 tests/catalogs/test_db.py create mode 100644 tests/storage/test_db.py diff --git a/README.md b/README.md index 8c897227..dbba44e9 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@

-A dead simple data catalog for engineers +A metadata toolkit written in Python

@@ -14,40 +14,101 @@ pylint

-## About +# About -Recap makes it easy for engineers to build infrastructure and tools that need metadata. Unlike traditional data catalogs, Recap is designed to power software. Read [Recap: A Data Catalog for People Who Hate Data Catalogs](https://cnr.sh/essays/recap-for-people-who-hate-data-catalogs) to learn more. +Recap is a Python library that helps you build tools for data quality, data governance, data profiling, data lineage, data contracts, and schema conversion. ## Features -* Supports major cloud data warehouses and Postgres -* No external system dependencies required -* Designed for the [CLI](https://docs.recap.cloud/latest/cli/) -* Runs as a [Python API](https://docs.recap.cloud/latest/api/recap.analyzers/) or [REST API](https://docs.recap.cloud/latest/rest/) -* Fully [pluggable](https://docs.recap.cloud/latest/guides/plugins/) +* Compatible with [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) filesystems and [SQLAlchemy](https://www.sqlalchemy.org) databases. +* Built-in support for [Parquet](https://parquet.apache.org), CSV, TSV, and JSON files. +* Includes [Pandas](https://pandas.pydata.org) for data profiling. +* Uses [Pydantic](https://pydantic.dev) for metadata models. +* Convenient [CLI](cli.md), [Python API](api/recap.repl.md), and [REST API](rest.md). +* No external system dependencies. ## Installation pip install recap-core -## Commands +## Usage -* `recap catalog list` - List a data catalog directory. -* `recap catalog read` - Read metadata from the data catalog. -* `recap catalog search` - Search the data catalog for metadata. -* `recap crawl` - Crawl infrastructure and save metadata in the data catalog. -* `recap plugins analyzers` - List all analyzer plugins. -* `recap plugins browsers` - List all browser plugins. -* `recap plugins catalogs` - List all catalog plugins. -* `recap plugins commands` - List all command plugins. -* `recap serve` - Start Recap's REST API. +Grab schemas from filesystems: -## Getting Started +```python +schema("s3://corp-logs/2022-03-01/0.json") +``` + +And databases: + +```python +schema("snowflake://ycbjbzl-ib10693/TEST_DB/PUBLIC/311_service_requests") +``` + +In a standardized format: + +```json +{ + "fields": [ + { + "name": "unique_key", + "type": "VARCHAR", + "nullable": false, + "comment": "The service request tracking number." + }, + { + "name": "complaint_description", + "type": "VARCHAR", + "nullable": true, + "comment": "Service request type" + } + ] +} +``` + +See what schemas used to look like: + +```python +schema("snowflake://ycbjbzl-ib10693/TEST_DB/PUBLIC/311_service_requests", datetime(2023, 1, 1)) +``` + +Build metadata extractors: + +```python +@registry.metadata("s3://{path:path}.json", include_df=True) +@registry.metadata("bigquery://{project}/{dataset}/{table}", include_df=True) +def pandas_describe(df: DataFrame, *_) -> BaseModel: + description_dict = df.describe(include="all") + return PandasDescription.parse_obj(description_dict) +``` -See the [Quickstart](https://docs.recap.cloud/latest/quickstart/) page to get started. +Crawl your data: -## Warning +```python +crawl("s3://corp-logs") +crawl("bigquery://floating-castle-728053") +``` -> ⚠️ This package is still under development and may not be stable. The API may break at any time.
+And read the results: + +```python +search("json_extract(metadata_obj, '$.count') > 9999", PandasDescription) +``` + +See where data comes from: + +```python +writers("bigquery://floating-castle-728053/austin_311/311_service_requests") +``` + +And where it's going: + +```python +readers("bigquery://floating-castle-728053/austin_311/311_service_requests") +``` + +All cached in Recap's catalog. + +## Getting Started -Recap is still a little baby application. It's going to wake up crying in the middle of the night. It's going to vomit on the floor once in a while. But if you give it some love and care, it'll be worth it. As time goes on, it'll grow up and be more mature. Bear with it. +See the [Quickstart](quickstart.md) page to get started. diff --git a/docs/api/recap.analyzers.md b/docs/api/recap.analyzers.md deleted file mode 100644 index f0008055..00000000 --- a/docs/api/recap.analyzers.md +++ /dev/null @@ -1 +0,0 @@ -::: recap.analyzers diff --git a/docs/api/recap.browsers.md b/docs/api/recap.browsers.md deleted file mode 100644 index efebbf6a..00000000 --- a/docs/api/recap.browsers.md +++ /dev/null @@ -1 +0,0 @@ -::: recap.browsers diff --git a/docs/api/recap.catalog.md b/docs/api/recap.catalog.md new file mode 100644 index 00000000..ec11ab61 --- /dev/null +++ b/docs/api/recap.catalog.md @@ -0,0 +1 @@ +::: recap.catalog diff --git a/docs/api/recap.catalogs.md b/docs/api/recap.catalogs.md deleted file mode 100644 index 38d2d4a0..00000000 --- a/docs/api/recap.catalogs.md +++ /dev/null @@ -1 +0,0 @@ -::: recap.catalogs diff --git a/docs/api/recap.crawler.md b/docs/api/recap.crawler.md deleted file mode 100644 index 0ecadd18..00000000 --- a/docs/api/recap.crawler.md +++ /dev/null @@ -1 +0,0 @@ -::: recap.crawler diff --git a/docs/api/recap.integrations.md b/docs/api/recap.integrations.md new file mode 100644 index 00000000..80582edd --- /dev/null +++ b/docs/api/recap.integrations.md @@ -0,0 +1 @@ +::: recap.integrations diff --git a/docs/api/recap.metadata.md b/docs/api/recap.metadata.md new file mode 100644 index 00000000..5436d5ad --- /dev/null +++ b/docs/api/recap.metadata.md @@ -0,0 +1 @@ +::: recap.metadata diff --git a/docs/api/recap.paths.md b/docs/api/recap.paths.md deleted file mode 100644 index acde62a2..00000000 --- a/docs/api/recap.paths.md +++ /dev/null @@ -1 +0,0 @@ -::: recap.paths diff --git a/docs/api/recap.repl.md b/docs/api/recap.repl.md new file mode 100644 index 00000000..fd9d4b20 --- /dev/null +++ b/docs/api/recap.repl.md @@ -0,0 +1 @@ +::: recap.repl diff --git a/docs/api/recap.storage.md b/docs/api/recap.storage.md new file mode 100644 index 00000000..633614f9 --- /dev/null +++ b/docs/api/recap.storage.md @@ -0,0 +1 @@ +::: recap.storage diff --git a/docs/cli.md b/docs/cli.md index c06b8a76..83e953da 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -1,6 +1,6 @@ # Recap CLI -Execute Recap's CLI using the `recap` command. Recap's CLI is pluggable, so the `recap` command will have subcommands for each plugin you've installed. By default, Recap ships with the following command plugins. +Execute Recap's CLI using the `recap` command. The CLI allows you to crawl, search, and read metadata from live systems (using `--refresh`) and Recap's catalog. 
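For example, a short session against a local directory might look like the sketch below. The subcommands are taken from the quickstart; the trailing `--refresh` invocation is an assumption about how the flag described above is used, not a documented recipe.

```bash
# Crawl a local directory and cache its metadata in Recap's catalog.
recap crawl /tmp/data

# Browse and read the cached metadata.
recap ls /tmp/data
recap schema file:///tmp/data/foo.json

# Search the catalog with a SQLite JSON expression.
recap search schema "json_extract(metadata_obj, '$.fields') IS NOT NULL"

# Assumed usage: read from the live system instead of the catalog.
recap schema file:///tmp/data/foo.json --refresh
```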
::: mkdocs-typer :module: recap.cli diff --git a/docs/guides/configuration.md b/docs/guides/configuration.md index 91ee292d..0b231e29 100644 --- a/docs/guides/configuration.md +++ b/docs/guides/configuration.md @@ -1,31 +1,28 @@ -Though Recap's CLI can run without any configuration, you might want to configure Recap using a config file. Recap uses [Dynaconf](https://www.dynaconf.com/) for its configuration system. +Though Recap's CLI can run without any configuration, you might want to configure Recap. Recap uses Pydantic's [BaseSettings](https://docs.pydantic.dev/usage/settings/) class for its configuration system. -## Config Locations +## Configs -Configuraton is stored in `~/.recap/settings.toml` by default. You can override the default location by setting the `SETTINGS_FILE_FOR_DYNACONF` environment variable. See Dynaconf's [documentation](https://www.dynaconf.com/configuration/#envvar) for more information. +See Recap's [config.py](https://github.com/recap-cloud/recap/blob/main/recap/config.py) for all available configuration parameters. -## Schema +Commonly set environment variables include: -Recap's `settings.toml` has two main sections: `catalog` and `crawlers`. +```bash +RECAP_STORAGE_SETTINGS__URL=http://localhost:8000/storage +RECAP_LOGGING_CONFIG_FILE=/tmp/logging.toml +``` -* The `catalog` section configures the storage layer; it uses SQLite by default. Run `recap plugins catalogs` to see other options. -* The `crawlers` section defines infrastructure to crawl. Only the `url` field is required. You may optionally specify analyzer `excludes` and path `filters` as well. +!!! note -```toml -[catalog] -plugin = "recap" -url = "http://localhost:8000" + Note the double-underscore (_dunder_) in the `URL` environment variable. This is a common way to set nested dictionary and object values in Pydantic's `BaseSettings` classes. You can also set JSON objects like `RECAP_STORAGE_SETTINGS='{"url": "http://localhost:8000/storage"}'`. See Pydantic's [settings management](https://docs.pydantic.dev/usage/settings/) page for more information. -[[crawlers]] -url = "postgresql://username@localhost/some_db" -excludes = [ - "sqlalchemy.profile" -] -filters = [ - "/**/tables/some_table" -] -``` +## Dotenv + +Recap supports [.env](https://www.dotenv.org) files to manage environment variables. Simply create a `.env` file in your current working directory and use Recap as usual. Pydantic handles the rest. + +## Home + +`RECAP_HOME` defines where Recap looks for storage and secret files. By default, `RECAP_HOME` is set to `~/.recap`. ## Secrets -Do not store database credentials in your `settings.toml`; use Dynaconf's secret management instead. See Dynaconf's [documentation](https://www.dynaconf.com/secrets/) for more information. \ No newline at end of file +You can set environment variables with secrets in them using Pydantic's [secret handling mechanism](https://docs.pydantic.dev/usage/settings/#secret-support). By default, Recap looks for secrets in `$RECAP_HOME/.secrets`. diff --git a/docs/guides/logging.md b/docs/guides/logging.md index 43610cc9..6dcb59c4 100644 --- a/docs/guides/logging.md +++ b/docs/guides/logging.md @@ -2,7 +2,7 @@ Recap uses Python's standard [logging](https://docs.python.org/3/library/logging ## Customizing
+You can customize Recap's log output. Set the `RECAP_LOGGING_CONFIG_FILE` environment variable to point to a [TOML](https://toml.io) file that conforms to Python's [dictConfig](https://docs.python.org/3/library/logging.config.html#logging-config-dictschema) schema. ```toml version = 1 @@ -30,4 +30,4 @@ propagate = false handlers = ['default'] level = "INFO" propagate = false -``` \ No newline at end of file +``` diff --git a/docs/guides/plugins.md b/docs/guides/plugins.md deleted file mode 100644 index 6600dbb3..00000000 --- a/docs/guides/plugins.md +++ /dev/null @@ -1,103 +0,0 @@ -You can extend Recap with plugins. In fact, everything in Recap is a plugin except for its crawler (and even that might change eventually). - -Plugins are implemented using Pythons `entry-points` package metadata. See Python's [using pacakge metadata](https://packaging.python.org/en/latest/guides/creating-and-discovering-plugins/#using-package-metadata) page for more details on this style of plugin architecture. - -There are five types of plugins: - -* Analyzers -* Browsers -* Catalogs -* Commands -* Routers - -## Analyzers - -[Analyzer plugins](analyzers.md) must implement the [AbstractAnalyzer](https://github.com/recap-cloud/recap/blob/main/recap/analyzers/abstract.py) class. - -Packages can export their analyzers using the `recap.analyzers` entry-point. Here's how Recap's built-in analyzers are defined in its [pyproject.toml](https://github.com/recap-cloud/recap/blob/main/pyproject.toml): - -```toml -[project.entry-points."recap.analyzers"] -"db.location" = "recap.analyzers.db.location" -"sqlalchemy.access" = "recap.analyzers.sqlalchemy.access" -"sqlalchemy.columns" = "recap.analyzers.sqlalchemy.columns" -"sqlalchemy.comment" = "recap.analyzers.sqlalchemy.comment" -"sqlalchemy.foreign_keys" = "recap.analyzers.sqlalchemy.foreign_keys" -"sqlalchemy.indexes" = "recap.analyzers.sqlalchemy.indexes" -"sqlalchemy.primary_key" = "recap.analyzers.sqlalchemy.primary_key" -"sqlalchemy.profile" = "recap.analyzers.sqlalchemy.profile" -"sqlalchemy.view_definition" = "recap.analyzers.sqlalchemy.view_definition" -``` - -Every entry-point points to a module with a `create_analyzer(**config)` method. - -## Browsers - -[Browser plugins](browsers.md) must implement the [AbstractBrowser](https://github.com/recap-cloud/recap/blob/main/recap/browsers/abstract.py) class. - -Packages can export their browsers using the `recap.browsers` entry-point. Here's how Recap's built-in browser is defined in its [pyproject.toml](https://github.com/recap-cloud/recap/blob/main/pyproject.toml): - -```toml -[project.entry-points."recap.browsers"] -db = "recap.browsers.db" -``` - -Every entry-point points to a module with a `create_browser(**config)` method. - -## Catalogs - -[Catalog plugins](catalogs.md) must implement the [AbstractCatalog](https://github.com/recap-cloud/recap/blob/main/recap/catalogs/abstract.py) class. - -Packages can export their catalogs using the `recap.catalogs` entry-point. Here's how Recap's built-in catalogs are defined in its [pyproject.toml](https://github.com/recap-cloud/recap/blob/main/pyproject.toml): - -```toml -[project.entry-points."recap.catalogs"] -db = "recap.catalogs.db" -recap = "recap.catalogs.recap" -``` - -Every entry-point points to a module with a `create_catalog(**config)` method. - -## Commands - -[Command plugins](commands.md) use [Typer](https://typer.tiangolo.com/). 
Plugins must expose a `typer.Typer()` object, usually defined as: - -```python -app = typer.Typer() -``` - -Packages can export their commands using the `recap.commands` entry-point. Here's how Recap's built-in commands are defined in its [pyproject.toml](https://github.com/recap-cloud/recap/blob/main/pyproject.toml): - -```toml -[project.entry-points."recap.commands"] -catalog = "recap.commands.catalog:app" -crawl = "recap.commands.crawl:app" -plugins = "recap.commands.plugins:app" -serve = "recap.commands.serve:app" -``` - -## Routers - -[Server plugins](server.md) use [FastAPI](https://fastapi.tiangolo.com/). Plugins must expose a `fastapi.APIRouter()` object, usually defined as: - -```python -router = fastapi.APIRouter() -``` - -Packages can export their commands using the `recap.routers` entry-point. Here's how Recap's built-in routers are defined in its [pyproject.toml](https://github.com/recap-cloud/recap/blob/main/pyproject.toml): - -```toml -[project.entry-points."recap.routers"] -"catalog.typed" = "recap.routers.catalog.typed:router" -"catalog.untyped" = "recap.routers.catalog.untyped:router" -``` - -Routers are added relative to the HTTP server's root path. - -!!! tip - - Recap calls `include_router(route)` for each object in the `recap.routers` entry-point. This means that anything that works with `include_router` can be exposed (even [GraphQL APIs](https://fastapi.tiangolo.com/advanced/graphql/)). - -!!! warning - - [Order matters](https://fastapi.tiangolo.com/tutorial/path-params/#order-matters) when adding routers to FastAPI. Recap does not currently support router order prioritization; routers are added in an unpredictable order. If multiple routers contain the same path, the first one will handle incoming requests to its path. diff --git a/docs/index.md b/docs/index.md index 82b7999d..c96a9c91 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,30 +1,97 @@ # Recap -Recap makes it easy for engineers to build infrastructure and tools that need metadata. Unlike traditional data catalogs, Recap is designed to power software. Read [Recap: A Data Catalog for People Who Hate Data Catalogs](https://cnr.sh/essays/recap-for-people-who-hate-data-catalogs) to learn more. +Recap is a Python library that helps you build tools for data quality, data governance, data profiling, data lineage, data contracts, and schema conversion. ## Features -* Supports major cloud data warehouses and PostgreSQL -* No external system dependencies required -* Designed for the [CLI](cli.md) -* Runs as a [Python API](api/recap.analyzers.md) or [REST API](rest.md) -* Fully [pluggable](guides/plugins.md) +* Compatible with [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) filesystems and [SQLAlchemy](https://www.sqlalchemy.org) databases. +* Built-in support for [Parquet](https://parquet.apache.org), CSV, TSV, and JSON files. +* Includes [Pandas](https://pandas.pydata.org) for data profiling. +* Uses [Pydantic](https://pydantic.dev) for metadata models. +* Convenient [CLI](cli.md), [Python API](api/recap.repl.md), and [REST API](rest.md). +* No external system dependencies. ## Installation pip install recap-core -## Commands - -* `recap catalog list` - List a data catalog directory. -* `recap catalog read` - Read metadata from the data catalog. -* `recap catalog search` - Search the data catalog for metadata. -* `recap crawl` - Crawl infrastructure and save metadata in the data catalog. -* `recap plugins analyzers` - List all analyzer plugins.
-* `recap plugins browsers` - List all browser plugins. -* `recap plugins catalogs` - List all catalog plugins. -* `recap plugins commands` - List all command plugins. -* `recap serve` - Start Recap's REST API. +## Usage + +Grab schemas from filesystems: + +```python +schema("s3://corp-logs/2022-03-01/0.json") +``` + +And databases: + +```python +schema("snowflake://ycbjbzl-ib10693/TEST_DB/PUBLIC/311_service_requests") +``` + +In a standardized format: + +```json +{ + "fields": [ + { + "name": "unique_key", + "type": "VARCHAR", + "nullable": false, + "comment": "The service request tracking number." + }, + { + "name": "complaint_description", + "type": "VARCHAR", + "nullable": true, + "comment": "Service request type" + } + ] +} +``` + +See what schemas used to look like: + +```python +schema("snowflake://ycbjbzl-ib10693/TEST_DB/PUBLIC/311_service_requests", datetime(2023, 1, 1)) +``` + +Build metadata extractors: + +```python +@registry.metadata("s3://{path:path}.json", include_df=True) +@registry.metadata("bigquery://{project}/{dataset}/{table}", include_df=True) +def pandas_describe(df: DataFrame, *_) -> BaseModel: + description_dict = df.describe(include="all") + return PandasDescription.parse_obj(description_dict) +``` + +Crawl your data: + +```python +crawl("s3://corp-logs") +crawl("bigquery://floating-castle-728053") +``` + +And read the results: + +```python +search("json_extract(metadata_obj, '$.count') > 9999", PandasDescription) +``` + +See where data comes from: + +```python +writers("bigquery://floating-castle-728053/austin_311/311_service_requests") +``` + +And where it's going: + +```python +readers("bigquery://floating-castle-728053/austin_311/311_service_requests") +``` + +All cached in Recap's catalog. ## Getting Started diff --git a/docs/quickstart.md b/docs/quickstart.md index eea32b59..86fbbe1b 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -1,6 +1,6 @@ ## Install -Start by installing Recap. Python 3.10 or above is required. +Start by installing Recap. Python 3.10 is required. pip install recap-core @@ -8,48 +8,59 @@ Start by installing Recap. Python 3.10 or above is required. Now let's crawl a database: - === "CLI" - recap crawl postgresql://username@localhost/some_db + ``` + recap crawl postgresql://username@localhost/some_db + ``` === "Python" ```python - from recap.analyzers.db.column import TableColumnAnalyzer - from recap.browsers.db import DatabaseBrowser - from recap.catalogs.db import DatabaseCatalog - from recap.crawler import Crawler - from sqlalchemy import create_engine - - some_db_engine = create_engine('postgresql://username@localhost/some_db') - catalog_engine = create_engine('sqlite://') - analyzers = [ - TableColumnAnalyzer(some_db_engine), - # Other analyzers can go here, too. - ] - browser = DatabaseBrowser(some_db_engine) - catalog = DatabaseCatalog(catalog_engine) - crawler = Crawler(browser, catalog, analyzers) - crawler.crawl() + from recap.repl import * + + crawl("postgresql://username@localhost/some_db") ``` You can use any [SQLAlchemy](https://docs.sqlalchemy.org/en/14/dialects/) connect string. +=== "CLI" + + ``` recap crawl bigquery://some-project-12345 - recap crawl snowflake://username:password@account_identifier/SOME_DB/SOME_SCHHEMA?warehouse=SOME_COMPUTE + recap crawl snowflake://username:password@account_identifier/SOME_DB/SOME_SCHEMA?warehouse=SOME_COMPUTE + ``` -!!! 
note +=== "Python" + + ```python + from recap.repl import * - You must install appropriate drivers and [SQLAlchemy dialects](https://docs.sqlalchemy.org/en/14/dialects/) for the databases you wish to crawl. For PostgreSQL, you'll have to `pip install psycopg2`. For Snowflake and BigQuery, you'll have to `pip install snowflake-sqlalchemy` or `pip install sqlalchemy-bigquery`, respectively. + crawl("bigquery://some-project-12345") + crawl("snowflake://username:password@account_identifier/SOME_DB/SOME_SCHEMA?warehouse=SOME_COMPUTE") + ``` + +!!! note -Or maybe you want to crawl a filesystem: + You must install appropriate drivers and [SQLAlchemy dialects](https://docs.sqlalchemy.org/en/14/dialects/) for the databases you wish to crawl. For PostgreSQL, you'll have to `pip install psycopg2`. For Snowflake and BigQuery, you'll have to `pip install snowflake-sqlalchemy` and `pip install sqlalchemy-bigquery`, respectively. - recap crawl ~/data +You can also crawl filesystems and object stores. -Or object store: +=== "CLI" + ``` + recap crawl /tmp/data recap crawl s3://power-analysis-ready-datastore + ``` + +=== "Python" + + ```python + from recap.repl import * + + crawl("/tmp/data") + crawl("s3://power-analysis-ready-datastore") + ``` !!! note @@ -61,135 +72,73 @@ Crawled metadata is stored in a directory structure. See what's available using: === "CLI" - recap catalog list / + recap ls /tmp/data === "Python" ```python - from recap.catalogs.db import DatabaseCatalog - from sqlalchemy import create_engine + from recap.repl import * - engine = create_engine('sqlite://') - catalog = DatabaseCatalog(engine) - children = catalog.ls('/') + ls("/tmp/data") ``` Recap will respond with a JSON list in the CLI: ```json [ - "databases", - "filesystems" + "file:///tmp/data/foo", + "file:///tmp/data/bar.json" ] ``` -Append children to the path to browse around: +## Read + +After you poke around, try and read some metadata. === "CLI" - recap catalog list /databases + recap schema file:///tmp/data/foo.json === "Python" ```python - from recap.catalogs.db import DatabaseCatalog - from sqlalchemy import create_engine + from recap.repl import * - engine = create_engine('sqlite://') - catalog = DatabaseCatalog(engine) - results = catalog.ls('/databases') + schema("/tmp/data/foo.json") ``` -## Read +Recap will print `foo.json`'s inferred schema to the CLI in JSON format: + +```json +{ + "fields": [ + { + "name": "test", + "type": "string", + "default": null, + "nullable": null, + "comment": null + } + ] +} +``` + +## Time -After you poke around, try and read some metadata. Every node in the path can have metadata, but right now only tables and views do. You can look at metadata using the `recap catalog read` command: +Recap keeps historical data. You can set the `time` parameter to see what data looked like at a specific point in time. This is useful for debugging data quality issues.
=== "CLI" - recap catalog read /databases/postgresql/instances/localhost/schemas/some_db/tables/some_table + recap schema file:///tmp/data/foo.json --time 2020-02-22 === "Python" ```python - from recap.catalogs.db import DatabaseCatalog - from sqlalchemy import create_engine + from recap.repl import * - engine = create_engine('sqlite://') - catalog = DatabaseCatalog(engine) - metadata = catalog.read('/databases/postgresql/instances/localhost/schemas/some_db/tables/some_table') + schema("/tmp/data/foo.json", datetime(2022, 2, 22)) ``` -Recap will print all of `some_table`'s metadata to the CLI in JSON format: - -```json -{ - "sqlalchemy.access": { - "username": { - "privileges": [ - "INSERT", - "SELECT", - "UPDATE", - "DELETE", - "TRUNCATE", - "REFERENCES", - "TRIGGER" - ], - "read": true, - "write": true - } - }, - "sqlalchemy.columns": { - "email": { - "autoincrement": false, - "default": null, - "generic_type": "VARCHAR", - "nullable": false, - "type": "VARCHAR" - }, - "id": { - "autoincrement": true, - "default": "nextval('\"some_db\".some_table_id_seq'::regclass)", - "generic_type": "BIGINT", - "nullable": false, - "type": "BIGINT" - } - }, - "sqlalchemy.profile": { - "email": { - "count": 10, - "distinct": 10, - "empty_strings": 0, - "max_length": 32, - "min_length": 13, - "nulls": 0 - }, - "id": { - "average": 5.5, - "count": 10, - "max": 10, - "min": 1, - "negatives": 0, - "nulls": 0, - "sum": 55.0, - "zeros": 0 - } - }, - "sqlalchemy.indexes": { - "index_some_table_on_email": { - "columns": [ - "email" - ], - "unique": false - } - }, - "sqlalchemy.primary_key": { - "constrained_columns": [ - "id" - ], - "name": "some_table_pkey" - } -} -``` ## Search @@ -197,17 +146,18 @@ Recap stores its metadata in [SQLite](https://www.sqlite.org/) by default. You c === "CLI" - recap catalog search "json_extract(metadata, '$.\"sqlalchemy.columns\".some_col') IS NOT NULL" + recap search schema "json_extract(metadata_obj, '$.fields') IS NOT NULL" === "Python" ```python - from recap.catalogs.db import DatabaseCatalog - from sqlalchemy import create_engine + from recap.repl import * - engine = create_engine('sqlite://') - catalog = DatabaseCatalog(engine) - results = catalog.search("json_extract(metadata, '$.\"sqlalchemy.columns\".some_col') IS NOT NULL") + search("json_extract(metadata_obj, '$.fields') IS NOT NULL") ``` -The database file defaults to `~/.recap/catalog/recap.db`, if you wish to open a SQLite client directly. +The database file defaults to `~/.recap/recap.db`, in case you wish to open a SQLite client directly. + +## Integrations + +See the [Integrations](api/recap.integrations.md) page for all of the systems Recap supports and what data you can crawl. diff --git a/docs/rest.md b/docs/rest.md index 5556801a..25b89f82 100644 --- a/docs/rest.md +++ b/docs/rest.md @@ -1,4 +1,4 @@ -Recap comes with an HTTP/JSON API server implementation of Recap's [catalog](catalogs.md) interface. The server allows non-Python systems to integrate with Recap. The server also allows metadata to be shared amgonst different systems when they all point to the same server. +Recap comes with an HTTP/JSON API server implementation of Recap's [storage](api/recap.storage.md) interface. The server allows non-Python systems to integrate with Recap, and different systems to share their metadata when they all point to the same server. ## Starting @@ -6,67 +6,48 @@ Execute `recap serve` to start Recap's server.
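As a rough sketch of how different systems can share one catalog, the snippet below starts the server and then routes a second Recap process through it using the storage URL setting from the [configuration](guides/configuration.md) guide; the exact workflow is an assumption rather than a prescribed setup.

```bash
# Start Recap's HTTP/JSON API (uvicorn listens on port 8000 by default).
recap serve

# In another shell: route Recap's storage layer through the server so crawled
# metadata lands in the shared catalog (setting shown in the configuration guide).
export RECAP_STORAGE_SETTINGS__URL=http://localhost:8000/storage
recap crawl postgresql://username@localhost/some_db
```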
## Configuring -Recap's server is implemented as a [FastAPI](https://fastapi.tiangolo.com/) in a [uvicorn](https://www.uvicorn.org/) web server. Any configuration set in your `settings.toml` file under the `server.uvicorn` namespace will be passed to Recap's `uvicorn.run` invocation. +Recap's server is implemented as a [FastAPI](https://fastapi.tiangolo.com/) app running in a [uvicorn](https://www.uvicorn.org/) web server. You can configure uvicorn using the `RECAP_UVICORN_SETTINGS` environment variable. -```toml -[server] -uvicorn.port = 9000 -``` - -You can also enable only certain [router plugins](plugins.md#routers) using the `server.plugins` setting: - -```toml -[server] -plugins = [ - "catalog.typed" -] +```bash +RECAP_UVICORN_SETTINGS='{"port": 9000}' ``` ## Endpoints -Recap has the following endpoints: - -* GET `/catalog?query=` - Search the Recap catalog. -* GET | PUT | PATCH `/catalog/{path}/metadata` - Read, write, or remove metadata. -* GET `/catalog/{path}/children` - List a path's child directories. -* DELETE `/catalog/{path}` - Delete a path, its metadata, and its children. - Recap's JSON schema is visible at [http://localhost:8000/openapi.json](http://localhost:8000/openapi.json). API documentation is also visible at [http://localhost:8000/docs](http://localhost:8000/docs) and [http://localhost:8000/redoc](http://localhost:8000/redoc). -## Models - -Recap's catalog [router plugin](plugins.md#routers) comes in two flavors: typed and untyped. The [endpoints](server.md#endpoints) listed above are untyped--metadata is a `dict[str, Any]`. Typed endpoints use [Pydantic](https://pydantic.dev) data models from [analyzers](analyzers.md) to define strongly typed (and validated) metadata models. - -Recap's server turns on both routers by default. Interacting with typed paths will validate metadata JSON. Interacting with untyped paths will allow arbitrary JSON dictionaries through. A typed path looks something like this: - - /catalog/databases/{scheme}/instances/{name}/schemas/{schema}/views/{view}/metadata - ## Examples The following examples illustrate how to call a Recap server running at [http://localhost:8000](http://localhost:8000).
-### Write metadata +### Write a Schema ```bash -curl -X PATCH 'http://localhost:8000/catalog/databases/postgresql/instances/some_instance/schemas/some_db/tables/some_table/metadata' \ - -d '{"comment": "This is some table"}' \ +curl -X PUT 'http://localhost:8000/storage/postgresql://localhost/some_db/some_schema/some_table/metadata/schema' \ + -d '{"fields": [{"name": "test"}]}' \ -H "Content-Type: application/json" ``` -### Read metadata +### Read a Schema + +```bash +curl 'http://localhost:8000/storage/postgresql://localhost/some_db/some_schema/some_table/metadata/schema' +``` + +### Write a Relationship ```bash -curl 'http://localhost:8000/catalog/databases/postgresql/instances/some_instance/schemas/some_db/tables/some_table/metadata' +curl -X POST 'http://localhost:8000/storage/postgresql://localhost/some_db/some_schema/links/contains?relationship=contains&other_url=http://localhost:8000/storage/postgresql://localhost/some_db/some_schema/some_table' ``` -### Read a directory +### Read a Relationship ```bash -curl http://localhost:8000/catalog/databases/postgresql/children +curl 'http://localhost:8000/storage/postgresql://localhost/some_db/some_schema/links/contains' ``` -### Delete a directory +### Delete a Relationship ```bash -curl -X DELETE 'http://localhost:8000/catalog/databases/postgresql/instances/some_instance/schemas/some_db/tables/some_table' +curl -X DELETE 'http://localhost:8000/storage/postgresql://localhost/some_db/some_schema/links/contains?relationship=contains&other_url=http://localhost:8000/storage/postgresql://localhost/some_db/some_schema/some_table' ``` diff --git a/mkdocs.yml b/mkdocs.yml index a037a15f..c77b59ed 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -11,13 +11,12 @@ nav: - Guides: - Configuration: guides/configuration.md - Logging: guides/logging.md - - Plugins: guides/plugins.md - Python API: - - recap.analyzers: api/recap.analyzers.md - - recap.browsers: api/recap.browsers.md - - recap.catalogs: api/recap.catalogs.md - - recap.crawler: api/recap.crawler.md - - recap.paths: api/recap.paths.md + - recap.catalog: api/recap.catalog.md + - recap.integrations: api/recap.integrations.md + - recap.storage: api/recap.storage.md + - recap.metadata: api/recap.metadata.md + - recap.repl: api/recap.repl.md - REST API: rest.md - Recap CLI: cli.md theme: diff --git a/pdm.lock b/pdm.lock index 58cca3cb..1c57ef29 100644 --- a/pdm.lock +++ b/pdm.lock @@ -170,12 +170,6 @@ dependencies = [ "cffi>=1.12", ] -[[package]] -name = "dacite" -version = "1.8.0" -requires_python = ">=3.6" -summary = "Simple creation of data classes from dictionaries." - [[package]] name = "decorator" version = "5.1.1" @@ -188,12 +182,6 @@ version = "0.3.6" requires_python = ">=3.7" summary = "serialize all of python" -[[package]] -name = "dynaconf" -version = "3.1.11" -requires_python = ">=3.7" -summary = "The dynamic configurator for your Python Project" - [[package]] name = "exceptiongroup" version = "1.1.0" @@ -212,7 +200,7 @@ dependencies = [ [[package]] name = "fastparquet" -version = "2023.1.0" +version = "2023.2.0" requires_python = ">=3.8" summary = "Python support for Parquet file format" dependencies = [ @@ -231,7 +219,7 @@ summary = "A platform independent file lock." 
[[package]] name = "frictionless" -version = "5.5.3" +version = "5.6.3" summary = "Data management framework for Python that provides functionality to describe, extract, validate, and transform tabular data" dependencies = [ "attrs>=21.3.0", @@ -258,12 +246,12 @@ dependencies = [ [[package]] name = "frictionless" -version = "5.5.3" +version = "5.6.3" extras = ["json", "parquet"] summary = "Data management framework for Python that provides functionality to describe, extract, validate, and transform tabular data" dependencies = [ "fastparquet>=0.8", - "frictionless==5.5.3", + "frictionless==5.6.3", "ijson>=3.0", "jsonlines>=1.2", ] @@ -375,12 +363,12 @@ dependencies = [ [[package]] name = "google-cloud-bigquery" -version = "3.5.0" +version = "3.6.0" requires_python = ">=3.7" summary = "Google BigQuery API client library" dependencies = [ "google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5", - "google-cloud-core<3.0.0dev,>=1.4.1", + "google-cloud-core<3.0.0dev,>=1.6.0", "google-resumable-media<3.0dev,>=0.6.0", "grpcio<2.0dev,>=1.47.0", "packaging>=20.0.0", @@ -518,7 +506,7 @@ dependencies = [ [[package]] name = "humanize" -version = "4.5.0" +version = "4.6.0" requires_python = ">=3.7" summary = "Python humanize utilities" @@ -880,13 +868,24 @@ summary = "Cryptographic library for Python" [[package]] name = "pydantic" -version = "1.10.4" +version = "1.10.5" requires_python = ">=3.7" summary = "Data validation and settings management using python type hints" dependencies = [ "typing-extensions>=4.2.0", ] +[[package]] +name = "pydantic" +version = "1.10.5" +extras = ["dotenv"] +requires_python = ">=3.7" +summary = "Data validation and settings management using python type hints" +dependencies = [ + "pydantic==1.10.5", + "python-dotenv>=0.10.4", +] + [[package]] name = "pygments" version = "2.14.0" @@ -980,7 +979,7 @@ summary = "Read key-value pairs from a .env file and set them as environment var [[package]] name = "python-slugify" -version = "8.0.0" +version = "8.0.1" requires_python = ">=3.7" summary = "A Python slugify application that also handles Unicode" dependencies = [ @@ -1094,7 +1093,7 @@ summary = "Tool to Detect Surrounding Shell" [[package]] name = "simpleeval" -version = "0.9.12" +version = "0.9.13" summary = "A simple, safe single expression evaluator library." 
[[package]] @@ -1331,7 +1330,7 @@ dependencies = [ [metadata] lock_version = "4.1" -content_hash = "sha256:5276c531843ea109692d7ad0f02395130f0ce42b4b2d4ebbc706b35a7024a73f" +content_hash = "sha256:3fc85d0b9ca9fcc7147e63294829b8566be88b42e6f68f21f5ad54863a2c1bcd" [metadata.files] "aiobotocore 2.4.2" = [ @@ -1669,10 +1668,6 @@ content_hash = "sha256:5276c531843ea109692d7ad0f02395130f0ce42b4b2d4ebbc706b35a7 {url = "https://files.pythonhosted.org/packages/e3/3f/41186b1f2fd86a542d399175f6b8e43f82cd4dfa51235a0b030a042b811a/cryptography-38.0.4.tar.gz", hash = "sha256:175c1a818b87c9ac80bb7377f5520b7f31b3ef2a0004e2420319beadedb67290"}, {url = "https://files.pythonhosted.org/packages/fb/28/0544f67e2ffdc15874d7a650a867c78a7dba245afe3392f51cfae363545c/cryptography-38.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:80ca53981ceeb3241998443c4964a387771588c4e4a5d92735a493af868294f9"}, ] -"dacite 1.8.0" = [ - {url = "https://files.pythonhosted.org/packages/6f/6d/f7ee0f5410665cdfbd56d0caf5da9217410348e5a0c11d3e6cfe1c1ddd7a/dacite-1.8.0.tar.gz", hash = "sha256:6257a5e505b61a8cafee7ef3ad08cf32ee9b885718f42395d017e0a9b4c6af65"}, - {url = "https://files.pythonhosted.org/packages/ec/5f/d922519ac948491915ab5b3270a88b0f517a003e6b12af5d71bf9ecc93a7/dacite-1.8.0-py3-none-any.whl", hash = "sha256:f7b1205cc5d9b62835aac8cbc1e6e37c1da862359a401f1edbe2ae08fbdc6193"}, -] "decorator 5.1.1" = [ {url = "https://files.pythonhosted.org/packages/66/0c/8d907af351aa16b42caae42f9d6aa37b900c67308052d10fdce809f8d952/decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, {url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, @@ -1681,10 +1676,6 @@ content_hash = "sha256:5276c531843ea109692d7ad0f02395130f0ce42b4b2d4ebbc706b35a7 {url = "https://files.pythonhosted.org/packages/7c/e7/364a09134e1062d4d5ff69b853a56cf61c223e0afcc6906b6832bcd51ea8/dill-0.3.6.tar.gz", hash = "sha256:e5db55f3687856d8fbdab002ed78544e1c4559a130302693d839dfe8f93f2373"}, {url = "https://files.pythonhosted.org/packages/be/e3/a84bf2e561beed15813080d693b4b27573262433fced9c1d1fea59e60553/dill-0.3.6-py3-none-any.whl", hash = "sha256:a07ffd2351b8c678dfc4a856a3005f8067aea51d6ba6c700796a4d9e280f39f0"}, ] -"dynaconf 3.1.11" = [ - {url = "https://files.pythonhosted.org/packages/54/6f/09c3ca2943314e0cae5cb2eeca1b77f5968855e13d6fdaae32c8e055eb7c/dynaconf-3.1.11.tar.gz", hash = "sha256:d9cfb50fd4a71a543fd23845d4f585b620b6ff6d9d3cc1825c614f7b2097cb39"}, - {url = "https://files.pythonhosted.org/packages/e7/67/0600b5fbd27928c112a11a5018b3e9ebf3d7f896d510c3c231f8d09886ae/dynaconf-3.1.11-py2.py3-none-any.whl", hash = "sha256:87e0b3b12b5db9e8fb465e1f8c7fdb926cd2ec5b6d88aa7f821f316df93fb165"}, -] "exceptiongroup 1.1.0" = [ {url = "https://files.pythonhosted.org/packages/15/ab/dd27fb742b19a9d020338deb9ab9a28796524081bca880ac33c172c9a8f6/exceptiongroup-1.1.0.tar.gz", hash = "sha256:bcb67d800a4497e1b404c2dd44fca47d3b7a5e5433dbab67f96c1a685cdfdf23"}, {url = "https://files.pythonhosted.org/packages/e8/14/9c6a7e5f12294ccd6975a45e02899ed25468cd7c2c86f3d9725f387f9f5f/exceptiongroup-1.1.0-py3-none-any.whl", hash = "sha256:327cbda3da756e2de031a3107b81ab7b3770a602c4d16ca618298c526f4bec1e"}, @@ -1693,48 +1684,48 @@ content_hash = "sha256:5276c531843ea109692d7ad0f02395130f0ce42b4b2d4ebbc706b35a7 {url = 
"https://files.pythonhosted.org/packages/09/b3/d259ee31dfe35e583a629f39b16047a9afe455694403bf51510981414721/fastapi-0.89.1.tar.gz", hash = "sha256:15d9271ee52b572a015ca2ae5c72e1ce4241dd8532a534ad4f7ec70c376a580f"}, {url = "https://files.pythonhosted.org/packages/8f/89/adf4525d1870021b65ec562e83e9f46d96494ad95f238d0264ef1ab6b604/fastapi-0.89.1-py3-none-any.whl", hash = "sha256:f9773ea22290635b2f48b4275b2bf69a8fa721fda2e38228bed47139839dc877"}, ] -"fastparquet 2023.1.0" = [ - {url = "https://files.pythonhosted.org/packages/01/82/f763c8b962e34afe8447938d57d2dddd58704bc34bb098f7a39bc3d01016/fastparquet-2023.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3d138a35979d72e4e2e1c06a6f275ea8b8885d1484e791fa7ad148af3aca8878"}, - {url = "https://files.pythonhosted.org/packages/07/df/773ebeb49765b61732182ffecd8e3841eac2f975fb86aa6f57b169d6f939/fastparquet-2023.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:72fcd440472a4acfda2ab2007c2c23de37bce33ad4c609ab095aeb00012e699c"}, - {url = "https://files.pythonhosted.org/packages/09/11/0c990b6018c8354c86fe9508a9110790cd3e0e67ba6f7e392f1b38715b5c/fastparquet-2023.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74ebaff8f4f7922f44953161770c44a88b61dccd3cc11393f20856e34c3cf05c"}, - {url = "https://files.pythonhosted.org/packages/09/1d/3ac895547d35913ca415c590e5e49b09f3c327acb12aeecf1a6f0937e477/fastparquet-2023.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:201b05ececa2e2e607230039cea6f9e0027837e8e273c8ad83886f10699bc9c9"}, - {url = "https://files.pythonhosted.org/packages/09/4f/988d44a910a6386f92d3c4b338e2ce83fdac0d8a09806025a1600fcaf016/fastparquet-2023.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3feb1758b7b746e92d7aef64a013a0402a5919ff0147803276bc40e102141815"}, - {url = "https://files.pythonhosted.org/packages/13/bc/dd198efd89cc93fe88bc13b8311b893abee3338ce43f4d40f559a9e16b83/fastparquet-2023.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1096fdebb87a9630b69bd7c68185783a337d01c1cd24916b1489ecb82b55cefb"}, - {url = "https://files.pythonhosted.org/packages/19/85/58404653154672486730e229aecb266d96dc19fbb8507be0dd0af8d38c4c/fastparquet-2023.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c00c47cce430204f4e7c007f84e420feada5676a6e752e093ca039cab5fa7370"}, - {url = "https://files.pythonhosted.org/packages/1a/0c/1dc5cf3c330407f731fadf9f57ce4480fab42d6475e4e3b6b4093118dd5e/fastparquet-2023.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fafd22c2a799ae9f3fcc6c1763d2480da3d47199beb6c8667b04d688a5507905"}, - {url = "https://files.pythonhosted.org/packages/1d/57/ca4b91822829ccd93b8db8a5db75183aed871e0291d8612b8f3fdf2af230/fastparquet-2023.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00347f09a060852ce4330ce678c638977faf6fdb5c29caf89ad5651e0f0d7621"}, - {url = "https://files.pythonhosted.org/packages/25/34/9a2e0e88b72ae47526b88bca11b21cd995f8c070c4d6bab323f74365420e/fastparquet-2023.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:02e0619a86e9e328373cbfb22fceb8e4054b6d32badffb565ff21d7a3566ed38"}, - {url = "https://files.pythonhosted.org/packages/27/8d/78cb48b493b52802619774177500ab3efe2e21afe73943365bc5d7bab31a/fastparquet-2023.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:76dd48cb568c4596baded551251f870a3690a43893e29653baf26062549b82b3"}, - {url = 
"https://files.pythonhosted.org/packages/34/21/bc9b1ddec23e8393f50ddeb8d5dc991867c342b75a34c436bf59240bf020/fastparquet-2023.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:993079d95120ab234b7bfae200c3b7f56b16df4e284c62353a466dbfce951d23"}, - {url = "https://files.pythonhosted.org/packages/48/19/f77aa6e385bd0334b8c7813a34b370e0802046e2be74a0a631c095a639c1/fastparquet-2023.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c3a1ae4dbd079bc4b195249a0791a187c45b9b1802af947167c8d76a01cd8a79"}, - {url = "https://files.pythonhosted.org/packages/49/66/3740dd7582dca2f271f1c34b50aec6b3e75f6e4b36a51e79b7ca27f87437/fastparquet-2023.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ebcaa49b57d4f11112160e80f3feab1a36af68072e415672da985930c66c3a2"}, - {url = "https://files.pythonhosted.org/packages/50/de/0aba2473e4c9d0a79920d696b26b3c73edf09b0fb0c46f5ac88021d0f43a/fastparquet-2023.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b8256c56bea62d43fd26307f68fd2ad281a1b21478b64a94bb94a01681a97583"}, - {url = "https://files.pythonhosted.org/packages/55/6e/100b64a9ef61b17486f41249f8a03341a1e649f7b7a9bec2ec61cee98689/fastparquet-2023.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:09e2bcca0d95867b0364637a02d032844a496a47c2a2926e007a126e2bc25f55"}, - {url = "https://files.pythonhosted.org/packages/57/7d/9c13b9a67e337fb8b5b4e2051641bdb13e926bcfd318d181b97553ca208a/fastparquet-2023.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98065a55bfbedddcc237a791109ea9b3ac3e8008318e4c8e7b39227219494e4b"}, - {url = "https://files.pythonhosted.org/packages/58/08/040ce055f4857a2fbbe8f2ce1ceecd975751f09df8cc8efa5f973bc14f65/fastparquet-2023.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:851fa21b1df421d8acadfd10025d7721c46c2182d4a64cef9a3811fa4a25a2eb"}, - {url = "https://files.pythonhosted.org/packages/5b/18/e7aba247e1481d670697d087631b3d27493489a13502bf5f18c293193066/fastparquet-2023.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cb3c6406e086db3bf5835a62e46626111928e50bad5bfe56e63d40d293303be1"}, - {url = "https://files.pythonhosted.org/packages/67/6e/d372358bb20901c82ff546ea1449c9e6e034552c8f93d0b73d527ec034e3/fastparquet-2023.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3bcf1e969a42f8dabedca2cb255e7649d0725eafebf1e897450d84af504a5c70"}, - {url = "https://files.pythonhosted.org/packages/74/24/664687395c2e9f193103445763548da1c099ea8848d36c98373763e98793/fastparquet-2023.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:97b978b90037d312d673dfd2e2c17cca85c692eaa9373f44856b1d5ed48a8cec"}, - {url = "https://files.pythonhosted.org/packages/78/70/424e5ac86f57c37a60069d29b41c67942f6d4656de08246008b889be22dc/fastparquet-2023.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cec14b87d5f721ba85e0fc0797a9adfb751d8e501863b5c587da09c2e65f2095"}, - {url = "https://files.pythonhosted.org/packages/7c/41/0bcacc9f66060f94e9a382726ce73dea074327f6a5fcda75479954df0dc3/fastparquet-2023.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:11adc51b17af433db8486b9be959c806034d44184e073249bd3285db85dc768e"}, - {url = "https://files.pythonhosted.org/packages/85/c1/1121eef85efa6258dbb1d51df944894e810994ad500de62fab0a7291fd49/fastparquet-2023.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:e873286445e0850a5f044c71b9c3f55279a1fbd7b7e39590c866f24de5ce850f"}, - {url = 
"https://files.pythonhosted.org/packages/9a/f0/c7b5a7c9ad52ddd281b7c27fa3ac7f840a5c476a19529282a075382b0ed0/fastparquet-2023.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8fdfc1adcbc0ea1d05f9ac3576cf12732189c54e4b1c9d38da990dc36d9cc348"}, - {url = "https://files.pythonhosted.org/packages/a4/6a/dcb4ae7d42d508ff88266c24a82762167d46885cdf4ccdf522b8de5c9b99/fastparquet-2023.1.0.tar.gz", hash = "sha256:92252538823da2bf958d2f2edd14e3864ae296d28f5be24e07eb685b4b08bed2"}, - {url = "https://files.pythonhosted.org/packages/ae/d1/6dc0cc2ede895033cd87a522edd920d052844936c6e6852d85ee83481251/fastparquet-2023.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c40fe744c478c64105dae97b1bdf10709c5f730f12fbeaa719a6714513c4eb7e"}, - {url = "https://files.pythonhosted.org/packages/c2/a0/91e3142725f4aa0aac9c84bcf6f335e0ca1e4939da608b63e932f3db61cb/fastparquet-2023.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:a104fede9b113e079a9e480242de809b0eacb95d718d20c3a9e14a65cffd4031"}, - {url = "https://files.pythonhosted.org/packages/cf/84/c382cb188de46d88841a6e56f86d9436dde6804967fefa224def781c3d4c/fastparquet-2023.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fc6af1f2b2f9c29f1e61097fa7a8adcbf568815dea787ed2d2590d1ec8467826"}, - {url = "https://files.pythonhosted.org/packages/d7/3d/27121152560920ae42c63f3f4c596680c6574464e467b86a98e3c3775442/fastparquet-2023.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:568406db0c7fc37179e468503221e526a4945e553d145fbf1f6344b5b3a8c8e6"}, - {url = "https://files.pythonhosted.org/packages/dc/55/45590ce8841f30d09bcde7e6a743f33d70404f7b3e9a8d02a4bf6acd16cc/fastparquet-2023.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:dea4af358ff2b55101d7708e9309283ec6dacd99d42b7060d79d5c1227bfa079"}, - {url = "https://files.pythonhosted.org/packages/e9/8a/6f7198406e2f7838ecbe92872190f4e9d419598f9002a4bfe94c12346ccb/fastparquet-2023.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5efcb6e0280fe8e103e8a5f6bf4a5ecd32915d3f9959a4e85f64661c7cbecede"}, - {url = "https://files.pythonhosted.org/packages/f6/fe/5bab4c32a55831eb84f0dae22f94d26150c6c4e393b075f59158b64f9e9d/fastparquet-2023.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0db7578d62945e4b9b6e983afc0f15fe9d82f47f76ebc3cdbd713c5fadd4ea84"}, +"fastparquet 2023.2.0" = [ + {url = "https://files.pythonhosted.org/packages/01/35/a89e737358df747857c743bcb3317fded3a05122f4c01610929eebba047b/fastparquet-2023.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3197bf76a0bf294c3da2f61fa6d830aeff13afd3f30dd4eaf6c430cc56a491d"}, + {url = "https://files.pythonhosted.org/packages/02/87/ce1ed202d9c2410f4eb690837c103782bb3b0c72c8436fd6b200628b31b1/fastparquet-2023.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2a54e352777504faef9a1526191a7d9b77c306c11d901db1581512dcbb131fb"}, + {url = "https://files.pythonhosted.org/packages/09/2e/b8b8cc4acdc0ee8dfc3f36d8aa1726ec2fc32f1eff8495f91e7c5f6eb1fe/fastparquet-2023.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58a1061f14fe8f85ed1c05055d915818eba3661d947dd9498ae1425319e502af"}, + {url = "https://files.pythonhosted.org/packages/0a/9d/c9f84c271fedd5cbc02995d34926c90108c56d228b2c0f51c3b92fff2f0b/fastparquet-2023.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:b0746e0d25bfdc78abdf380bc0c2e7f1731cba63c6019a8d1c9511394b5c8b41"}, + {url = "https://files.pythonhosted.org/packages/12/b0/0a35d91400c84522deecf6ef9dcaedcd24f20bf0625cf871df58f8a65ce6/fastparquet-2023.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:54bf1a799b9323e424fc103b207f5228553162e5e4f4438baaeb5ebb4e68f891"}, + {url = "https://files.pythonhosted.org/packages/2a/a9/20ab6bb489e15595ac41d54ce951f94ef2c074812cc18a895d3c3b975001/fastparquet-2023.2.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:2171fb8bc0d7146224267c7d207aac9f6234e09ca580a3c89ccd6c4138431c6e"}, + {url = "https://files.pythonhosted.org/packages/32/19/d597129bc8c7dcf77a7c3e6d91f45c325ff8827d5f198bc0a76341c56d54/fastparquet-2023.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95b40caa0cc19af832ee08f0b73bbb912442f2fbcc155370c455b33f5905ca11"}, + {url = "https://files.pythonhosted.org/packages/36/4d/c8367ab8ee1335fdb488980d488a1c0c1cc649eecb0b4159f777dabdfb03/fastparquet-2023.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c0f73555d9dec670c014c2326a692c764d65ab97576a029bfa7c01bce665eb23"}, + {url = "https://files.pythonhosted.org/packages/46/87/89401b6813d5abbd3d6d4a77476716bf845c4f94bbe91aa91504ae290805/fastparquet-2023.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f56cd417341330ae48af864a337676b5eb6e3511588de7e090cef8f495631d04"}, + {url = "https://files.pythonhosted.org/packages/57/d4/e8c5807f292e0a2f5c4a225d4d07230d24d48f4beb77b1aef7b989990e76/fastparquet-2023.2.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e48c1caf96efb749226cdca9ee5c08637e01bc7915451799bcbc5f98bf00c9da"}, + {url = "https://files.pythonhosted.org/packages/5d/3d/bc6bf1f3fb6e34ffc44e1876ab6e46a30b5330608d0f038f394158f5bf05/fastparquet-2023.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:faecb9ce2fefede6b8f16418c724df4b47842329bed43a0fe8df7675ff8efe05"}, + {url = "https://files.pythonhosted.org/packages/5d/9a/fdd9a2a027eb50f8d86424e13d4af04c4a7907c185b6559923445828bce9/fastparquet-2023.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c78a1f9a7c3b72b7d0276d0a7e300a34b2397198d4d11ece73dbe0a706afc17"}, + {url = "https://files.pythonhosted.org/packages/60/ba/c4b9ff20705484cf8c3aaed83d92974dcbcf3621b436fb232b264f23b35c/fastparquet-2023.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fbe95709d77bcf1198ee1fba26fa34fd235e9efe3c9149f7c020b110e82a1b70"}, + {url = "https://files.pythonhosted.org/packages/66/34/50343ffaefd3db3314b7ca62fe9f77862e6eb6ca0f4916bfbd96ddd2fd3a/fastparquet-2023.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:337c0badf884261544d2bb9e1cdcdd2e25b6d0c5f1fd90b181d39ab2b86669ea"}, + {url = "https://files.pythonhosted.org/packages/6b/e2/e2d809a1461e20bfb1200528c0e9344aa1d9cef3e1fbe518004f47d78451/fastparquet-2023.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0eeed04a83651081ee72ddd572a11aa1b86ceeb7efe74d97fa9cd30133c8064"}, + {url = "https://files.pythonhosted.org/packages/6c/66/7bef2707ce119cf9827c8f982a2bbf0c5dc40212c02caf12574a6d1ae3ee/fastparquet-2023.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c27b6270a9053656945f1533aae8376a3171aced7a3ec36db185d30a86924aa4"}, + {url = "https://files.pythonhosted.org/packages/6d/57/544cfeb3e243b4dfcf4aaf569ac27f3e06394ee9e0a6ec13385a436d5edf/fastparquet-2023.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:679cc13841be1774338486bb55fcbadbed8de607cadb19377eebf57ad0be91cf"}, + 
{url = "https://files.pythonhosted.org/packages/6e/f3/3bb3750e28aa85a394efc78ce4dabf710d6b080fb9d6923270ac09c34c98/fastparquet-2023.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:40b4af7c48b69f04444e544ac1bc80725e82bc2f015d9ba0488e3fcdb7e9b913"}, + {url = "https://files.pythonhosted.org/packages/73/e1/16edc9bb7228e7d638945dfb2135bfe11b9dc90f2e294e08ba0dc5611fbc/fastparquet-2023.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7b564c98aee49b2f9f570be880954f6fbfdd65d3566a4992562731b3af12e528"}, + {url = "https://files.pythonhosted.org/packages/7e/5d/c58795e5550231ce84b39fec783e3f4836082f0162e2436227832716ee02/fastparquet-2023.2.0.tar.gz", hash = "sha256:7611447ce3ff5696539f7e43289da2491ea41f7cb92d4dbada374012b62c51c3"}, + {url = "https://files.pythonhosted.org/packages/81/8d/6f4157d567fb5e7c3396158a4e42c07916f19528eab92f2b3679f0c65f72/fastparquet-2023.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f0da977d4ebb5f98c9cb42ae410e17cc277086c3dde97bc080231afd89aa6f2"}, + {url = "https://files.pythonhosted.org/packages/85/d0/ea244a3267f203ff61a8ee0b9cc7d88d55411965656a26c4bd472daacb8b/fastparquet-2023.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23d7e099c76284e2285b879f50cae36ab85b5e9247a2873798c98edd2a5cd630"}, + {url = "https://files.pythonhosted.org/packages/88/e5/d89160701654f5c021140670984ef5952de361e5414435c60b5295ac23b3/fastparquet-2023.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c77266057461eafceddf5d596c454a98e42fa94f4e756180cba2c6984a24cf66"}, + {url = "https://files.pythonhosted.org/packages/89/b3/8979616df203758517165fcf05a4e864152f8151e6ae944586dfac8eda5d/fastparquet-2023.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:6da6e07de55358cc8f74a1943bdecb11a4f87d56d10fe479a8a9354aad37506a"}, + {url = "https://files.pythonhosted.org/packages/b3/4a/1051a5c8d74e2714ae5a5d21358cb0802b5ae6d338366b3132bf106a8386/fastparquet-2023.2.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:2d69f539d19177602924714402ef6bde58c2d573f661ace86081a1d3dcbb88ef"}, + {url = "https://files.pythonhosted.org/packages/b3/d6/f63325e5aa9a49cb23b45cda3211ca40ae66ef7f902a3441dfd4731343a0/fastparquet-2023.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c63452355ee0bd4650e5fb190957fbce3576e1aa0baeae4c01eb5d90d534a5e5"}, + {url = "https://files.pythonhosted.org/packages/b8/1f/1cfec864f01648a0aa25f4151dcb2e1f801064735787fc05605c1503a9d3/fastparquet-2023.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:409de263558e81b8a22a9e8f43f925b86d7e16bd4b55611971ef79c50f45244f"}, + {url = "https://files.pythonhosted.org/packages/ce/79/35238aab3155d0ed8615210b86db56ffa2b34f928ab82f0eebe7c32197b2/fastparquet-2023.2.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6509f0865ddd8e8e62bef7ec06198996f47154b9d9005054ed7eb57070bc7fd1"}, + {url = "https://files.pythonhosted.org/packages/d6/89/82aabdac27cdfb6987974edcbbe9fd666ff09915a7365dfc726f337d5532/fastparquet-2023.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:8585ce6ab202a42299e1be413e06dc3c6ee3292a07f09b22d16c845b1481c4cc"}, + {url = "https://files.pythonhosted.org/packages/e1/db/ed0136c4b13a9a93d94fed224a3bc3feca3d1768c792e3d19e8ab3ce6a83/fastparquet-2023.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:10a2d9fde179ccdb3a9addad20b52db4a51af1c5be41b6c747765bf314f11c1c"}, + {url = 
"https://files.pythonhosted.org/packages/e3/cb/8e64af92a2fdf41ade0b5f4a7906c151af2581eb6eef8aae9c6fe12aaf98/fastparquet-2023.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:265a3b51663eefc966c5f0d3f0ed697a14c91cb64f50e793d2b7b74920f48002"}, + {url = "https://files.pythonhosted.org/packages/f3/69/24d0ff8af9f2bb628bba69ccaad01253fdb93f6c9893128fccfcfaf7e664/fastparquet-2023.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c3f4eefbe472169cd5365064ff4cd0313bcb04bbcf6a4eb144ba4bf4764247"}, + {url = "https://files.pythonhosted.org/packages/f5/79/8d3236355cd1b1736af484b1d40f0cc2e0bbb563a30c8841f3218089ee4d/fastparquet-2023.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9d22115f706602e0c216c4ec31f403e347e7aa5c20b181ea8901bd1b55d0dd03"}, ] "filelock 3.9.0" = [ {url = "https://files.pythonhosted.org/packages/0b/dc/eac02350f06c6ed78a655ceb04047df01b02c6b7ea3fc02d4df24ca87d24/filelock-3.9.0.tar.gz", hash = "sha256:7b319f24340b51f55a2bf7a12ac0755a9b03e718311dac567a0f4f7fabd2f5de"}, {url = "https://files.pythonhosted.org/packages/14/4c/b201d0292ca4e0950f0741212935eac9996f69cd66b92a3587e594999163/filelock-3.9.0-py3-none-any.whl", hash = "sha256:f58d535af89bb9ad5cd4df046f741f8553a418c01a7856bf0d173bbc9f6bd16d"}, ] -"frictionless 5.5.3" = [ - {url = "https://files.pythonhosted.org/packages/5c/57/85535cc5eb4d2e6b07c094867752b73d887dbc6fd8e2bb6f3422f0c0c4df/frictionless-5.5.3.tar.gz", hash = "sha256:5080aaac3884384a1eac9bbc305a5b8ae6686a6244e9e240cc087f8c47fc404a"}, - {url = "https://files.pythonhosted.org/packages/bd/7e/b3d6f00a73604e7f24dcadb1c9a3fb92bc899c7665f9ae1d1c3669136a7c/frictionless-5.5.3-py2.py3-none-any.whl", hash = "sha256:50f5361b99f9446319c0e5c142f65d392ee349e37e1393ae84815a99e5617773"}, +"frictionless 5.6.3" = [ + {url = "https://files.pythonhosted.org/packages/8f/3c/dbadf757c02c033751d996dd98c2d8a22a9eea0e02ebfb19f90e9b9e5603/frictionless-5.6.3-py2.py3-none-any.whl", hash = "sha256:7b43fa8ed0927af795afb98a63a0d4c880210d8f9b7aeb46873e0d388b27c557"}, + {url = "https://files.pythonhosted.org/packages/bf/85/a1d20cd216ef472fa78cb77c1cf24b1e05ed757b65484b7ab956292440e4/frictionless-5.6.3.tar.gz", hash = "sha256:3f0c447f6050d73596fc3fbcbf335d361675de523d21e97b5c803eafcc990194"}, ] "frozenlist 1.3.3" = [ {url = "https://files.pythonhosted.org/packages/01/a3/a3c18bfd7bd2a56831b985140f98eb6dda684a2d8b2a54b1077b45c7f9d9/frozenlist-1.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23d16d9f477bb55b6154654e0e74557040575d9d19fe78a161bd33d7d76808e8"}, @@ -1847,9 +1838,9 @@ content_hash = "sha256:5276c531843ea109692d7ad0f02395130f0ce42b4b2d4ebbc706b35a7 {url = "https://files.pythonhosted.org/packages/06/98/52a79d64ca8fd2e37eff8da6c78c26b2209f19d8ace8cafd1634453c4393/google-auth-oauthlib-0.8.0.tar.gz", hash = "sha256:81056a310fb1c4a3e5a7e1a443e1eb96593c6bbc55b26c0261e4d3295d3e6593"}, {url = "https://files.pythonhosted.org/packages/98/0e/bfc3d7de5d1788871d1be3e6862fe3e56d92b446909b7b032c373fc4ecab/google_auth_oauthlib-0.8.0-py2.py3-none-any.whl", hash = "sha256:40cc612a13c3336d5433e94e2adb42a0c88f6feb6c55769e44500fc70043a576"}, ] -"google-cloud-bigquery 3.5.0" = [ - {url = "https://files.pythonhosted.org/packages/9a/24/3e3cab94189bbd15c5ba87600ebe124bdf049dbbc19963f91f68fa307bdf/google_cloud_bigquery-3.5.0-py2.py3-none-any.whl", hash = "sha256:358f54c473938b2d022335118b4e56cdcdaf22a5a112fa0cfeb888fd8814ba62"}, - {url = 
"https://files.pythonhosted.org/packages/f9/8a/4d307594709535f17fdaf5750fa201bbaf723a6f820abcc10361ff2c1db3/google-cloud-bigquery-3.5.0.tar.gz", hash = "sha256:dd3ca84e5be6fa9e0570fb21665a902cc5651cbd045842fb714164c99a2639c4"}, +"google-cloud-bigquery 3.6.0" = [ + {url = "https://files.pythonhosted.org/packages/56/96/d4978c15034a0f51bd33c37e0e9742476d977cdbbe3b27cb80cc91c18428/google-cloud-bigquery-3.6.0.tar.gz", hash = "sha256:5886c14f29097158d59afa74a6732dbdd583e6ef30dc9934a56ad532906de356"}, + {url = "https://files.pythonhosted.org/packages/f5/50/9bf320ca5782ed48178a0a8cb7ad520dea00640a4ca6bf7cbe22951cdb25/google_cloud_bigquery-3.6.0-py2.py3-none-any.whl", hash = "sha256:ed875c3a52050d140660850f04aaa59c1ec03b420a31fa7cc643a1726fc784d1"}, ] "google-cloud-bigquery-storage 2.18.1" = [ {url = "https://files.pythonhosted.org/packages/0b/94/8a9c6edf3be69e37274f71598e715a3ac5a6306a6caba6044fab85e5b712/google-cloud-bigquery-storage-2.18.1.tar.gz", hash = "sha256:5cd3de59ef27606989aff315c6dd2e05931fd0dd90129e616207b444945652a0"}, @@ -2113,9 +2104,9 @@ content_hash = "sha256:5276c531843ea109692d7ad0f02395130f0ce42b4b2d4ebbc706b35a7 {url = "https://files.pythonhosted.org/packages/ac/a2/0260c0f5d73bdf06e8d3fc1013a82b9f0633dc21750c9e3f3cb1dba7bb8c/httpx-0.23.3-py3-none-any.whl", hash = "sha256:a211fcce9b1254ea24f0cd6af9869b3d29aba40154e947d2a07bb499b3e310d6"}, {url = "https://files.pythonhosted.org/packages/f5/50/04d5e8ee398a10c767a341a25f59ff8711ae3adf0143c7f8b45fc560d72d/httpx-0.23.3.tar.gz", hash = "sha256:9818458eb565bb54898ccb9b8b251a28785dd4a55afbc23d0eb410754fe7d0f9"}, ] -"humanize 4.5.0" = [ - {url = "https://files.pythonhosted.org/packages/ab/bf/4e526ef224ca00f0a2f14513895c8a728aa94682ebbe756447de41230baa/humanize-4.5.0-py3-none-any.whl", hash = "sha256:127e333677183070b82e90e0faef9440f9a16dab92143e52f4523afb08ca9290"}, - {url = "https://files.pythonhosted.org/packages/de/ec/c9fa9a0e2b917bd74c18f9752912fd389b7d8e796cfb864e3c485a9bda5d/humanize-4.5.0.tar.gz", hash = "sha256:d6ed00ed4dc59a66df71012e3d69cf655d7d21b02112d435871998969e8aedc8"}, +"humanize 4.6.0" = [ + {url = "https://files.pythonhosted.org/packages/06/b1/9e491df2ee1c919d67ee328d8bc9f17b7a9af68e4077f3f5fac83a4488c9/humanize-4.6.0.tar.gz", hash = "sha256:5f1f22bc65911eb1a6ffe7659bd6598e33dcfeeb904eb16ee1e705a09bf75916"}, + {url = "https://files.pythonhosted.org/packages/22/2b/30e8725481b071ca53984742a443f944f9c74fb72f509a40b746912645e1/humanize-4.6.0-py3-none-any.whl", hash = "sha256:401201aca462749773f02920139f302450cb548b70489b9b4b92be39fe3c3c50"}, ] "idna 3.4" = [ {url = "https://files.pythonhosted.org/packages/8b/e1/43beb3d38dba6cb420cefa297822eac205a277ab43e5ba5d5c46faf96438/idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, @@ -2649,43 +2640,43 @@ content_hash = "sha256:5276c531843ea109692d7ad0f02395130f0ce42b4b2d4ebbc706b35a7 {url = "https://files.pythonhosted.org/packages/d1/72/d9ed0f68dfdd7322c3e0e4bcb0139b172bb66159dd2a5a95c6e2ce6f7a23/pycryptodomex-3.17-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:f854c8476512cebe6a8681cc4789e4fcff6019c17baa0fd72b459155dc605ab4"}, {url = "https://files.pythonhosted.org/packages/da/5f/4b904abe20347c88df413533ac88ad813e049639cc7e356673c8fe5fa450/pycryptodomex-3.17-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:caa937ff29d07a665dfcfd7a84f0d4207b2ebf483362fa9054041d67fdfacc20"}, ] -"pydantic 1.10.4" = [ - {url = 
"https://files.pythonhosted.org/packages/02/6b/c4b5773bcc216652cc6a040eb32697f99770cf9274d8ad254e621eb3fdd1/pydantic-1.10.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a9f2de23bec87ff306aef658384b02aa7c32389766af3c5dee9ce33e80222dfa"}, - {url = "https://files.pythonhosted.org/packages/09/46/66c65d678e4c1b151c36bd61fd7ad9ebd1b48ecccc115d5dc77c1d7fe476/pydantic-1.10.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:05a81b006be15655b2a1bae5faa4280cf7c81d0e09fcb49b342ebf826abe5a72"}, - {url = "https://files.pythonhosted.org/packages/12/74/797cf42ee7093e73f740224ee7f9d3faba6e5f674243078a51fc38ba7a78/pydantic-1.10.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51bdeb10d2db0f288e71d49c9cefa609bca271720ecd0c58009bd7504a0c464c"}, - {url = "https://files.pythonhosted.org/packages/17/70/139ae58f5fa5e9000c63d49e1b74a256a74abf4064d7e9b236adc3e21251/pydantic-1.10.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6dc1cc241440ed7ca9ab59d9929075445da6b7c94ced281b3dd4cfe6c8cff817"}, - {url = "https://files.pythonhosted.org/packages/2d/c7/d284a73934b79077ff48c6e64f93dcf570660931c90bafbdadc9867bf929/pydantic-1.10.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:cd8702c5142afda03dc2b1ee6bc358b62b3735b2cce53fc77b31ca9f728e4bc8"}, - {url = "https://files.pythonhosted.org/packages/35/b1/c574b4d47ba9565f5984cf406ce06764a07994b1608d89d53207a7f67c33/pydantic-1.10.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f2f7eb6273dd12472d7f218e1fef6f7c7c2f00ac2e1ecde4db8824c457300416"}, - {url = "https://files.pythonhosted.org/packages/36/78/1755a9fe87b0480775bce2e812049669adbe4b006787257d288806caa580/pydantic-1.10.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:990406d226dea0e8f25f643b370224771878142155b879784ce89f633541a024"}, - {url = "https://files.pythonhosted.org/packages/49/0c/3cb9ddf7aba9a13c56585401ee7ea345ed583c2f848e783eec634c9726d3/pydantic-1.10.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fdf8d759ef326962b4678d89e275ffc55b7ce59d917d9f72233762061fd04a2d"}, - {url = "https://files.pythonhosted.org/packages/49/90/ff3dd0265279a2f0607995dfcd77720f0130918cf11ee9449b106d99b942/pydantic-1.10.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:572066051eeac73d23f95ba9a71349c42a3e05999d0ee1572b7860235b850cc6"}, - {url = "https://files.pythonhosted.org/packages/4a/52/79167d367d0765effd60faef145c54a213a5feab7a5c97055fa368f25031/pydantic-1.10.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8775d4ef5e7299a2f4699501077a0defdaac5b6c4321173bcb0f3c496fbadf85"}, - {url = "https://files.pythonhosted.org/packages/4e/26/38b8e36129e1f9e4d5e4481cee0cbc49b778ac103777c50cb2fca714afbe/pydantic-1.10.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:eb992a1ef739cc7b543576337bebfc62c0e6567434e522e97291b251a41dad7f"}, - {url = "https://files.pythonhosted.org/packages/53/17/34e54e352f6a3d304044e52d5ddd5cd621a62ec8fb7af08cc73af65dd3e1/pydantic-1.10.4.tar.gz", hash = "sha256:b9a3859f24eb4e097502a3be1fb4b2abb79b6103dd9e2e0edb70613a4459a648"}, - {url = "https://files.pythonhosted.org/packages/54/7e/e111f6ff353af848d44bb4f40311c1ca7dfb284efbf8a41122a6091a0996/pydantic-1.10.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d88c4c0e5c5dfd05092a4b271282ef0588e5f4aaf345778056fc5259ba098857"}, - {url = "https://files.pythonhosted.org/packages/58/1b/0132040ef3e8ec0ce96142d4759bde9f16b52ab7eac5f2c1ce3a5b641f16/pydantic-1.10.4-py3-none-any.whl", hash = 
"sha256:4948f264678c703f3877d1c8877c4e3b2e12e549c57795107f08cf70c6ec7774"}, - {url = "https://files.pythonhosted.org/packages/5f/05/faa76cdd1d58066678b104a8bfa2b657144b1996773d655e2d5abb72bfeb/pydantic-1.10.4-cp310-cp310-win_amd64.whl", hash = "sha256:7feb6a2d401f4d6863050f58325b8d99c1e56f4512d98b11ac64ad1751dc647d"}, - {url = "https://files.pythonhosted.org/packages/67/f7/05de7f3998a365725ea26ed44ce242dfa4e7ddb4fd849fd36902ff0a6715/pydantic-1.10.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a48f1953c4a1d9bd0b5167ac50da9a79f6072c63c4cef4cf2a3736994903583e"}, - {url = "https://files.pythonhosted.org/packages/6b/85/c3c30a050f04668dccf4ce8df015242a7ccaea8dface44b342f173f68991/pydantic-1.10.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2b3ce5f16deb45c472dde1a0ee05619298c864a20cded09c4edd820e1454129f"}, - {url = "https://files.pythonhosted.org/packages/6e/00/7e25a76d3629999587ea4f30b0b15f52a14a43c811a80168900005500f9b/pydantic-1.10.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e82a6d37a95e0b1b42b82ab340ada3963aea1317fd7f888bb6b9dfbf4fff57c"}, - {url = "https://files.pythonhosted.org/packages/6f/6a/a3b9a51b886eeee570ddb32ae64a8d2fd00cd25cb1daaf82260188d2d1e4/pydantic-1.10.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdf88ab63c3ee282c76d652fc86518aacb737ff35796023fae56a65ced1a5978"}, - {url = "https://files.pythonhosted.org/packages/7a/9c/3a9db59d67755033edb1588e6d412806fe8023ac5bdbf87a9b8806205bd7/pydantic-1.10.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6e7124d6855b2780611d9f5e1e145e86667eaa3bd9459192c8dc1a097f5e9903"}, - {url = "https://files.pythonhosted.org/packages/80/79/51583ea13a70715d497be473fc73596142d751dfae956a39b3a0196bc506/pydantic-1.10.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7b5a3821225f5c43496c324b0d6875fde910a1c2933d726a743ce328fbb2a8c"}, - {url = "https://files.pythonhosted.org/packages/87/7e/aec14140cb0ee6b62b5777e9d28eea44813b4d590826ad518b7e197e1200/pydantic-1.10.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:983e720704431a6573d626b00662eb78a07148c9115129f9b4351091ec95ecc3"}, - {url = "https://files.pythonhosted.org/packages/88/b4/123955cfb978fb9d2cfde7a92b588cffca5cb3772702a09e4ab5807574b1/pydantic-1.10.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5635de53e6686fe7a44b5cf25fcc419a0d5e5c1a1efe73d49d48fe7586db854"}, - {url = "https://files.pythonhosted.org/packages/8a/97/8f789eb4ab68abe9541f5765dc7f533dbc3d6c9c94cd70d1b01e21759cf9/pydantic-1.10.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:55b1625899acd33229c4352ce0ae54038529b412bd51c4915349b49ca575258f"}, - {url = "https://files.pythonhosted.org/packages/9e/85/13eb8a5121d1d37826118ac8d88fe856229aad43396a3680307eaee8c73e/pydantic-1.10.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78cec42b95dbb500a1f7120bdf95c401f6abb616bbe8785ef09887306792e66e"}, - {url = "https://files.pythonhosted.org/packages/ae/97/c9716e8060e3ed0bbd954258babe4c2f75092ca923972101d791230dcb7e/pydantic-1.10.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9193d4f4ee8feca58bc56c8306bcb820f5c7905fd919e0750acdeeeef0615b28"}, - {url = "https://files.pythonhosted.org/packages/ba/7f/47a90201dc4c11a514dfba59c689491d5018b83be21f682aa602c845c125/pydantic-1.10.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a9a6747cac06c2beb466064dda999a13176b23535e4c496c9d48e6406f92d42d"}, - {url = 
"https://files.pythonhosted.org/packages/d1/1a/44c9e2fa8d94cfb1d73352205960798d991a1236aec09d15bf702874ac64/pydantic-1.10.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75d52162fe6b2b55964fbb0af2ee58e99791a3138588c482572bb6087953113a"}, - {url = "https://files.pythonhosted.org/packages/d3/ab/0626c660fa632920c0a2623a07700adacb01986bd22a089f2669596096cd/pydantic-1.10.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b53e1d41e97063d51a02821b80538053ee4608b9a181c1005441f1673c55423"}, - {url = "https://files.pythonhosted.org/packages/da/e9/82b5585bb1d8a01c6b597fe30ef078ca3939dbbd7c1f7f9a6501062889ec/pydantic-1.10.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6f9d649892a6f54a39ed56b8dfd5e08b5f3be5f893da430bed76975f3735d15"}, - {url = "https://files.pythonhosted.org/packages/db/2a/41d60a843328d91b12c6efd1a18b17606bd2ebe498647e75721a9317b433/pydantic-1.10.4-cp311-cp311-win_amd64.whl", hash = "sha256:6a05a9db1ef5be0fe63e988f9617ca2551013f55000289c671f71ec16f4985e3"}, - {url = "https://files.pythonhosted.org/packages/de/d4/dcb8e4bc7777e2e0d79381cc4c63cda50e83e355fa10d64082c216905377/pydantic-1.10.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:301d626a59edbe5dfb48fcae245896379a450d04baeed50ef40d8199f2733b06"}, - {url = "https://files.pythonhosted.org/packages/df/8d/c52f913e533b2e71a94e7f22148b449abf328c46a5b4a1da4d0e7e9f659e/pydantic-1.10.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:887ca463c3bc47103c123bc06919c86720e80e1214aab79e9b779cda0ff92a00"}, - {url = "https://files.pythonhosted.org/packages/ea/45/86ec3475f45f02858808643f38700788c64bfef0896566936dc33a78d4ba/pydantic-1.10.4-cp39-cp39-win_amd64.whl", hash = "sha256:9cbdc268a62d9a98c56e2452d6c41c0263d64a2009aac69246486f01b4f594c4"}, - {url = "https://files.pythonhosted.org/packages/ec/f2/c136265b246eb0411b293763e1b5e18a22de2d8d6a084e5c3d7b9e6e796e/pydantic-1.10.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:39f4a73e5342b25c2959529f07f026ef58147249f9b7431e1ba8414a36761f53"}, - {url = "https://files.pythonhosted.org/packages/f4/09/6efdaefc6e967f03af3ae3d5e63575036598eb0c740a43a69a77be054a5f/pydantic-1.10.4-cp38-cp38-win_amd64.whl", hash = "sha256:4b05697738e7d2040696b0a66d9f0a10bec0efa1883ca75ee9e55baf511909d6"}, +"pydantic 1.10.5" = [ + {url = "https://files.pythonhosted.org/packages/10/1d/14dcf2aa8cde579271eee6928d1611b81987da5c21bf7c8ca467c8d2b82f/pydantic-1.10.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:532e97c35719f137ee5405bd3eeddc5c06eb91a032bc755a44e34a712420daf3"}, + {url = "https://files.pythonhosted.org/packages/1f/ab/0778d084867668ed4912c4e2001b0d9e0cd4cc54e504a731debf1a70f3a8/pydantic-1.10.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:305d0376c516b0dfa1dbefeae8c21042b57b496892d721905a6ec6b79494a66d"}, + {url = "https://files.pythonhosted.org/packages/1f/b6/436e7d212bbaf146164ef3579f1574bcd195bb1dd571b5a10aa307fc8302/pydantic-1.10.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51782fd81f09edcf265823c3bf43ff36d00db246eca39ee765ef58dc8421a642"}, + {url = "https://files.pythonhosted.org/packages/21/75/5e00165a2275186aaa6329e7017eac5a43df885dc826d26963677799cef0/pydantic-1.10.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6bb0452d7b8516178c969d305d9630a3c9b8cf16fcf4713261c9ebd465af0d73"}, + {url = 
"https://files.pythonhosted.org/packages/23/e2/2bb87450a57bfea0d73f91f81d8cc1f773541fe2f81b46b6446c8934b33f/pydantic-1.10.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3bb99cf9655b377db1a9e47fa4479e3330ea96f4123c6c8200e482704bf1eda2"}, + {url = "https://files.pythonhosted.org/packages/28/59/5d2fc3499d9ce8ce48ee7e00f043d5cc429a9198bd96c3512809428ade15/pydantic-1.10.5.tar.gz", hash = "sha256:9e337ac83686645a46db0e825acceea8e02fca4062483f40e9ae178e8bd1103a"}, + {url = "https://files.pythonhosted.org/packages/30/94/806b9b966b5cd99a05090d5306f8c2f6e8f0a2ac7737ed95e8503248e243/pydantic-1.10.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f836444b4c5ece128b23ec36a446c9ab7f9b0f7981d0d27e13a7c366ee163f8a"}, + {url = "https://files.pythonhosted.org/packages/3f/49/e00c1e4d1525ed01b58bb210509ca4d80eb2d587f0e3772f04fa9116951b/pydantic-1.10.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:3f9d9b2be177c3cb6027cd67fbf323586417868c06c3c85d0d101703136e6b31"}, + {url = "https://files.pythonhosted.org/packages/40/61/00570f1b5436ccbbb7ec393a079aee83d8720c97dad039365a2ea0d7a055/pydantic-1.10.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd46a0e6296346c477e59a954da57beaf9c538da37b9df482e50f836e4a7d4bb"}, + {url = "https://files.pythonhosted.org/packages/52/2e/6df235627e54a46e0cb4eab44a848b53521516a4b6bb55b8a7093998afae/pydantic-1.10.5-cp310-cp310-win_amd64.whl", hash = "sha256:45edea10b75d3da43cfda12f3792833a3fa70b6eee4db1ed6aed528cef17c74e"}, + {url = "https://files.pythonhosted.org/packages/53/68/2a14076f6d68393cee66dcd6a35bf8c93e9fc27db4d9a91589f9b154e04b/pydantic-1.10.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ce1612e98c6326f10888df951a26ec1a577d8df49ddcaea87773bfbe23ba5cc"}, + {url = "https://files.pythonhosted.org/packages/55/65/ad96ed56ecba85f01465d3caa06bc3e71e8a361d9c9d0a54fb0bee569407/pydantic-1.10.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:63200cd8af1af2c07964546b7bc8f217e8bda9d0a2ef0ee0c797b36353914984"}, + {url = "https://files.pythonhosted.org/packages/5b/ba/701da1b3f4a10131692d5e0eca2204b0cfea242db0283383a387f163fc5b/pydantic-1.10.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fd326aff5d6c36f05735c7c9b3d5b0e933b4ca52ad0b6e4b38038d82703d35b"}, + {url = "https://files.pythonhosted.org/packages/63/01/7c36f13cab83f7a72da53003a1d5e7238f055c2bcae60b90a5fd2bc7c2cc/pydantic-1.10.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:76c930ad0746c70f0368c4596020b736ab65b473c1f9b3872310a835d852eb19"}, + {url = "https://files.pythonhosted.org/packages/65/78/9c2c5689c69c1469104769ba7409997f08c08ecc9d56f90e2edf845bdf4f/pydantic-1.10.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3353072625ea2a9a6c81ad01b91e5c07fa70deb06368c71307529abf70d23325"}, + {url = "https://files.pythonhosted.org/packages/73/9e/f9978c38eb6ea8b34103149978c2e9bc10b0c3628d60962250834c5cbf38/pydantic-1.10.5-cp37-cp37m-win_amd64.whl", hash = "sha256:261f357f0aecda005934e413dfd7aa4077004a174dafe414a8325e6098a8e419"}, + {url = "https://files.pythonhosted.org/packages/75/bd/1dd020c1705d7752410092ade4c64a4a5b4b74dd5ac06ce29764be88a4fb/pydantic-1.10.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5920824fe1e21cbb3e38cf0f3dd24857c8959801d1031ce1fac1d50857a03bfb"}, + {url = "https://files.pythonhosted.org/packages/77/ef/964d596946997395c33179d546484aec844f86971e8d6cb837fe3f6b7593/pydantic-1.10.5-cp38-cp38-win_amd64.whl", hash = 
"sha256:f5bee6c523d13944a1fdc6f0525bc86dbbd94372f17b83fa6331aabacc8fd08e"}, + {url = "https://files.pythonhosted.org/packages/7a/5a/35a1f25b31208f406df6b828aede5fa2ed74bc2310e4f484ad9a7b0a2047/pydantic-1.10.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b473d00ccd5c2061fd896ac127b7755baad233f8d996ea288af14ae09f8e0d1e"}, + {url = "https://files.pythonhosted.org/packages/89/c7/a55f25e6161d1de2dc9b2c5a3691213f10a5c6f65e655c33ea56cb0bddbe/pydantic-1.10.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b429f7c457aebb7fbe7cd69c418d1cd7c6fdc4d3c8697f45af78b8d5a7955760"}, + {url = "https://files.pythonhosted.org/packages/91/d3/ade57023af199e5bbac09219952300135dcb8e0f410861bc0323075f6fe2/pydantic-1.10.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f582cac9d11c227c652d3ce8ee223d94eb06f4228b52a8adaafa9fa62e73d5c9"}, + {url = "https://files.pythonhosted.org/packages/92/c3/bae023ba6d8a9e71a7346df426d695b3b5d3e62ebf7134ff6eeb620f2c84/pydantic-1.10.5-cp311-cp311-win_amd64.whl", hash = "sha256:8481dca324e1c7b715ce091a698b181054d22072e848b6fc7895cd86f79b4449"}, + {url = "https://files.pythonhosted.org/packages/9b/62/672879ef41f0782b48ec1a1bb1241e68f770e46a3acc09ea8565c1c2897c/pydantic-1.10.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:663d2dd78596c5fa3eb996bc3f34b8c2a592648ad10008f98d1348be7ae212fb"}, + {url = "https://files.pythonhosted.org/packages/9d/3f/9834f773ce782c32e641dfc4b89973b9e48b413516d8cd4aa4531c735a66/pydantic-1.10.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:c9e5b778b6842f135902e2d82624008c6a79710207e28e86966cd136c621bfee"}, + {url = "https://files.pythonhosted.org/packages/a0/4e/4defb6a0294288fde74164791626e553fc8c9f34a7bda625a982ceffa9b5/pydantic-1.10.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58e41dd1e977531ac6073b11baac8c013f3cd8706a01d3dc74e86955be8b2c0c"}, + {url = "https://files.pythonhosted.org/packages/ab/23/1f3c2874bbdab881e85a887eb4834b6cb7d7ce8b1482b8eeb74231a0325a/pydantic-1.10.5-cp39-cp39-win_amd64.whl", hash = "sha256:5f3bc8f103b56a8c88021d481410874b1f13edf6e838da607dcb57ecff9b4594"}, + {url = "https://files.pythonhosted.org/packages/b0/44/b08588a7036c668f307c7ad97d8601940791fc7943c9d6f715424364a75c/pydantic-1.10.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ca9075ab3de9e48b75fa8ccb897c34ccc1519177ad8841d99f7fd74cf43be5bf"}, + {url = "https://files.pythonhosted.org/packages/bf/68/6ae8ad2d27e865957fce0e101f4284e746620df15df931933f7774670f2d/pydantic-1.10.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2185a3b3d98ab4506a3f6707569802d2d92c3a7ba3a9a35683a7709ea6c2aaa2"}, + {url = "https://files.pythonhosted.org/packages/c7/18/9b9da08649715f0ee99db6f416b32649b2209aa9d23c87ea636670aac071/pydantic-1.10.5-py3-none-any.whl", hash = "sha256:7c5b94d598c90f2f46b3a983ffb46ab806a67099d118ae0da7ef21a2a4033b28"}, + {url = "https://files.pythonhosted.org/packages/c9/fb/d8df7a150c1ecaf768b706f80730626b09c8cca479c685abe736625268d5/pydantic-1.10.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:6a4b0aab29061262065bbdede617ef99cc5914d1bf0ddc8bcd8e3d7928d85bd6"}, + {url = "https://files.pythonhosted.org/packages/d0/5f/4e1ead49d245ffb1933c8ca5d4d72adad9881d3001619c3930fe644a89f9/pydantic-1.10.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:9a9d9155e2a9f38b2eb9374c88f02fd4d6851ae17b65ee786a87d032f87008f8"}, + {url = 
"https://files.pythonhosted.org/packages/d4/47/951763175d317975ba9c7e8df0a087ff19fc955a04bebd56841d34fa5509/pydantic-1.10.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c428c0f64a86661fb4873495c4fac430ec7a7cef2b8c1c28f3d1a7277f9ea5ab"}, + {url = "https://files.pythonhosted.org/packages/e6/24/d9ff5e94c23c778447b7ad19c18c47228121cd12e60c7f71b925b9c628d4/pydantic-1.10.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:87f831e81ea0589cd18257f84386bf30154c5f4bed373b7b75e5cb0b5d53ea87"}, + {url = "https://files.pythonhosted.org/packages/f0/64/1c98e2a96f70cc651253713bb464a604f7f5dd575a0bcc07e7434a2b3347/pydantic-1.10.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3257bd714de9db2102b742570a56bf7978e90441193acac109b1f500290f5718"}, + {url = "https://files.pythonhosted.org/packages/f4/cb/7299ad5462f30555c9573a7b406d762841f1296b4ffecb800264ff6b5200/pydantic-1.10.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:72ef3783be8cbdef6bca034606a5de3862be6b72415dc5cb1fb8ddbac110049a"}, + {url = "https://files.pythonhosted.org/packages/ff/11/9db43f7cd6fe4f22170b282f9742b2d3b645d7d84cecc5221b4d7c50af44/pydantic-1.10.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:36e44a4de37b8aecffa81c081dbfe42c4d2bf9f6dff34d03dce157ec65eb0f15"}, ] "pygments 2.14.0" = [ {url = "https://files.pythonhosted.org/packages/0b/42/d9d95cc461f098f204cd20c85642ae40fbff81f74c300341b8d0e0df14e0/Pygments-2.14.0-py3-none-any.whl", hash = "sha256:fa7bd7bd2771287c0de303af8bfdfc731f51bd2c6a47ab69d117138893b82717"}, @@ -2752,9 +2743,9 @@ content_hash = "sha256:5276c531843ea109692d7ad0f02395130f0ce42b4b2d4ebbc706b35a7 {url = "https://files.pythonhosted.org/packages/64/62/f19d1e9023aacb47241de3ab5a5d5fedf32c78a71a9e365bb2153378c141/python_dotenv-0.21.1-py3-none-any.whl", hash = "sha256:41e12e0318bebc859fcc4d97d4db8d20ad21721a6aa5047dd59f090391cb549a"}, {url = "https://files.pythonhosted.org/packages/f5/d7/d548e0d5a68b328a8d69af833a861be415a17cb15ce3d8f0cd850073d2e1/python-dotenv-0.21.1.tar.gz", hash = "sha256:1c93de8f636cde3ce377292818d0e440b6e45a82f215c3744979151fa8151c49"}, ] -"python-slugify 8.0.0" = [ - {url = "https://files.pythonhosted.org/packages/3b/0e/95f48766da1472daa32b50eecbd444bfffda6d451669d27d1d8d56392487/python_slugify-8.0.0-py2.py3-none-any.whl", hash = "sha256:51f217508df20a6c166c7821683384b998560adcf8f19a6c2ca8b460528ccd9c"}, - {url = "https://files.pythonhosted.org/packages/49/3b/492affa71ccdeaadce1a6fba17e12fec301820b19b8cd7220d849686f8ec/python-slugify-8.0.0.tar.gz", hash = "sha256:f1da83f3c7ab839b3f84543470cd95bdb5a81f1a0b80fed502f78b7dca256062"}, +"python-slugify 8.0.1" = [ + {url = "https://files.pythonhosted.org/packages/b4/85/6aa722a11307ec572682023b76cad4c52cda708dfc25fcb4b4a6051da7ab/python_slugify-8.0.1-py2.py3-none-any.whl", hash = "sha256:70ca6ea68fe63ecc8fa4fcf00ae651fc8a5d02d93dcd12ae6d4fc7ca46c4d395"}, + {url = "https://files.pythonhosted.org/packages/de/63/0f60208d0d3dde1a87d30a82906fa9b00e902b57f1ae9565d780de4b41d1/python-slugify-8.0.1.tar.gz", hash = "sha256:ce0d46ddb668b3be82f4ed5e503dbc33dd815d83e2eb6824211310d3fb172a27"}, ] "pytz 2022.7.1" = [ {url = "https://files.pythonhosted.org/packages/03/3e/dc5c793b62c60d0ca0b7e58f1fdd84d5aaa9f8df23e7589b39cc9ce20a03/pytz-2022.7.1.tar.gz", hash = "sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0"}, @@ -2928,9 +2919,9 @@ content_hash = "sha256:5276c531843ea109692d7ad0f02395130f0ce42b4b2d4ebbc706b35a7 {url = 
"https://files.pythonhosted.org/packages/1f/13/fab0a3f512478bc387b66c51557ee715ede8e9811c77ce952f9b9a4d8ac1/shellingham-1.5.0.post1.tar.gz", hash = "sha256:823bc5fb5c34d60f285b624e7264f4dda254bc803a3774a147bf99c0e3004a28"}, {url = "https://files.pythonhosted.org/packages/ae/2a/7ad62b2c56e71c6330fc35cfd5813376e788146ef7c884cc2fdf5fe77696/shellingham-1.5.0.post1-py2.py3-none-any.whl", hash = "sha256:368bf8c00754fd4f55afb7bbb86e272df77e4dc76ac29dbcbb81a59e9fc15744"}, ] -"simpleeval 0.9.12" = [ - {url = "https://files.pythonhosted.org/packages/7d/39/d5be0242308735b87bea7dc8fdadaca1056d4e73a3e7db6c5f0d20a90f7f/simpleeval-0.9.12-py2.py3-none-any.whl", hash = "sha256:d82faa7dc88379614ea3b385fd84cc24f0aa4853432e267718526e5aeac6b1b9"}, - {url = "https://files.pythonhosted.org/packages/bc/9f/4b77fc4b6b988fc8e60a2f09f97e31c7aa2a02152ad22566726656d584da/simpleeval-0.9.12.tar.gz", hash = "sha256:3e0be507486d4e21cf9d08847c7e57dd61a1603950399985f7c5a0be7fd33e36"}, +"simpleeval 0.9.13" = [ + {url = "https://files.pythonhosted.org/packages/0a/51/bedb4af4f3fe4bb32a3cabfd285be388958c6d676f6b0fa65997812a381b/simpleeval-0.9.13-py2.py3-none-any.whl", hash = "sha256:22a2701a5006e4188d125d34accf2405c2c37c93f6b346f2484b6422415ae54a"}, + {url = "https://files.pythonhosted.org/packages/8f/fa/d2d5bbf9a03fe7b0956368ac5420cfcb072146be6e912a50747dc376133a/simpleeval-0.9.13.tar.gz", hash = "sha256:4a30f9cc01825fe4c719c785e3762623e350c4840d5e6855c2a8496baaa65fac"}, ] "six 1.16.0" = [ {url = "https://files.pythonhosted.org/packages/71/39/171f1c67cd00715f190ba0b100d606d440a28c93c7714febeca8b79af85e/six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, diff --git a/pyproject.toml b/pyproject.toml index 92fe91cb..d3125d01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "recap-core" -version = "0.4.2" -description = "A dead simple data catalog for engineers" +version = "0.5.0" +description = "A metadata toolkit written in Python" authors = [ {name = "Chris Riccomini", email = "criccomini@apache.org"}, ] @@ -11,11 +11,13 @@ dependencies = [ "httpx>=0.23.1", "typer>=0.7.0", "sqlalchemy>=1.4.45", - "dynaconf>=3.1.11", "rich>=12.6.0", "setuptools>=65.6.3", - "tomli>=2.0.1", - "dacite>=1.8.0", + "starlette>=0.22.0", + "pydantic[dotenv]>=1.10.5", + "pandas>=1.5.3", + "fsspec>=2023.1.0", + "frictionless[json,parquet]>=5.6.3", ] # < 3.11 for sqlalchemy-bigquery compatibility requires-python = ">=3.10, <3.11" @@ -40,35 +42,7 @@ repository = "https://github.com/recap-cloud/recap" [project.scripts] recap = "recap.cli:app" -[project.entry-points."recap.analyzers"] -"frictionless.columns" = "recap.analyzers.frictionless.columns" -"sqlalchemy.columns" = "recap.analyzers.sqlalchemy.columns" - -[project.entry-points."recap.browsers"] -db = "recap.browsers.db" -fs = "recap.browsers.fs" - -[project.entry-points."recap.catalogs"] -db = "recap.catalogs.db" -recap = "recap.catalogs.recap" - -[project.entry-points."recap.commands"] -catalog = "recap.commands.catalog:app" -crawl = "recap.commands.crawl:app" -plugins = "recap.commands.plugins:app" -serve = "recap.commands.serve:app" - -[project.entry-points."recap.routers"] -catalog = "recap.routers.catalog:router" - [project.optional-dependencies] -fs = [ - "fsspec>=2023.1.0", - "frictionless[json,parquet]>=5.5.1", -] -pandas = [ - "pandas>=1.5.3", -] gcp = [ "gcsfs>=2023.1.0", "sqlalchemy-bigquery>=1.5.0", @@ -90,6 +64,7 @@ dbs = [ "psycopg2>=2.9.5", "snowflake-sqlalchemy>=1.4.4", "sqlalchemy-bigquery>=1.5.0", + 
"google-cloud-bigquery>=3.6.0", ] fss = [ "s3fs>=2023.1.0", diff --git a/recap/analyzers/__init__.py b/recap/analyzers/__init__.py deleted file mode 100644 index 6af8034f..00000000 --- a/recap/analyzers/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from contextlib import contextmanager -from typing import Generator - -from recap.plugins import load_analyzer_plugins - -from .abstract import AbstractAnalyzer - - -@contextmanager -def create_analyzer( - plugin: str, - **config, -) -> Generator["AbstractAnalyzer", None, None]: - analyzer_plugins = load_analyzer_plugins() - if analyzer_module := analyzer_plugins.get(plugin): - with analyzer_module.create_analyzer(**config) as analyzer: - yield analyzer diff --git a/recap/analyzers/abstract.py b/recap/analyzers/abstract.py deleted file mode 100644 index 85595b63..00000000 --- a/recap/analyzers/abstract.py +++ /dev/null @@ -1,23 +0,0 @@ -from abc import ABC, abstractmethod - -from recap.metadata import Metadata - - -class AbstractAnalyzer(ABC): - """ - The abstract class for all analyzers. Analyzers are responsible for - inspecting data and returning metadata. - """ - - @abstractmethod - def analyze(self, path: str) -> Metadata | None: - """ - Analyze a path for an infrastructure instance. Only the path is - specified because the URL for the instance is passed in via the config - in `create_analyzer()`. - - :returns: Pydantic BaseModel that represents discovered metadata. This - data gets serialized as JSON in the catalog. - """ - - raise NotImplementedError diff --git a/recap/analyzers/frictionless/__init__.py b/recap/analyzers/frictionless/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/recap/analyzers/frictionless/columns.py b/recap/analyzers/frictionless/columns.py deleted file mode 100644 index 0409426d..00000000 --- a/recap/analyzers/frictionless/columns.py +++ /dev/null @@ -1,79 +0,0 @@ -from contextlib import contextmanager -from pathlib import PurePosixPath -from typing import Generator -from urllib.parse import urlparse - -from frictionless import Resource, describe # type: ignore - -from recap.analyzers.abstract import AbstractAnalyzer -from recap.schemas.schema import Field, Schema - -SUPPORTED_SCHEMES = set(["", "file", "http", "https", "s3"]) - - -class FileColumnAnalyzer(AbstractAnalyzer): - """ - Use Frictionless to fetch table schema information for CSV, TSV, JSON, and - Parquet files. The schema simply includes the name and type. - - CSV, TSV, and JSON schemas are inferred using Frictionless's `describe()` - inferrence. - """ - - def __init__(self, url: str): - """ - :param url: Base URL to connect to. The URL may be any format that - Frictionless accepts (local, S3, http, and so on). Local URLs must - start with `file://`. - """ - - self.url = url - - def analyze( - self, - path: str, - ) -> Schema | None: - """ - Analyze a path and return Frictionless's schema information. - - :param path: Path relative to the URL root. - :returns: Frictionless schema description. 
- """ - - path_posix = PurePosixPath(str(path)) - url_and_path = self.url + str(path_posix) - resource = None - - match path_posix.suffix: - case (".csv" | ".tsv" | ".parquet"): - resource = describe(url_and_path) - case (".json" | ".ndjson" | ".jsonl"): - resource = describe(path=url_and_path, format="ndjson") - - if isinstance(resource, Resource): - return Schema( - fields=[ - Field( - name=field.name, - type=field.type, - ) - for field in resource.schema.fields # pyright: ignore [reportOptionalMemberAccess] - if field.name - ], - ) - - -@contextmanager -def create_analyzer( - url: str, - **_, -) -> Generator[FileColumnAnalyzer, None, None]: - scheme = urlparse(url).scheme - if scheme in SUPPORTED_SCHEMES: - if scheme == "": - # Frictionless is paranoid about absolute paths. Use a file scheme - # so that it allows them. - url = f"file://{url}" - yield FileColumnAnalyzer(url) - else: - raise ValueError(f"Unsupported url={url}") diff --git a/recap/analyzers/sqlalchemy/__init__.py b/recap/analyzers/sqlalchemy/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/recap/analyzers/sqlalchemy/columns.py b/recap/analyzers/sqlalchemy/columns.py deleted file mode 100644 index b2a113b0..00000000 --- a/recap/analyzers/sqlalchemy/columns.py +++ /dev/null @@ -1,57 +0,0 @@ -import logging -from contextlib import contextmanager -from typing import Generator - -import sqlalchemy - -from recap.analyzers.abstract import AbstractAnalyzer -from recap.browsers.db import DatabaseURL, create_browser -from recap.schemas.schema import Field, Schema - -log = logging.getLogger(__name__) - - -class TableColumnAnalyzer(AbstractAnalyzer): - """ - Use SQLAlchemy to fetch table schema information for a table or view. The - schema uses SQLAlchemy's schema format. - """ - - def __init__(self, engine: sqlalchemy.engine.Engine): - self.engine = engine - - def analyze( - self, - path: str, - ) -> Schema | None: - """ - :param path: Fetch column schema information for a table or view at - this path. - :returns: Column schema information. - """ - - match DatabaseURL(str(self.engine.url), path): - case DatabaseURL(schema=str(schema), table=str(table)): - results = {} - columns = sqlalchemy.inspect(self.engine).get_columns( - table, - schema, - ) - return Schema( - fields=[ - Field( - name=column["name"], - type=str(column["type"]), - default=column["default"], - nullable=column["nullable"], - comment=column.get("comment"), - ) - for column in columns - ], - ) - - -@contextmanager -def create_analyzer(**config) -> Generator["TableColumnAnalyzer", None, None]: - with create_browser(**config) as browser: - yield TableColumnAnalyzer(browser.engine) diff --git a/recap/browsers/__init__.py b/recap/browsers/__init__.py deleted file mode 100644 index 889ce990..00000000 --- a/recap/browsers/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -Different data infrastructure have different types of objects (tables, columns, -files, topics, partitions, and so on). - -Recap uses a browser abstraction map infrastructure objects into a standard -directory format. A different browser is used for each type of infrastructure. -Browsers do not actually analyze a system's data for metadata; they simply show -what's available. - -Recap comes with a database browser and a filesystem browser. Other browsers -can be implemented and added as plugins. 
-""" - -from contextlib import contextmanager -from typing import Generator - -from recap.plugins import load_browser_plugins - -from .abstract import AbstractBrowser - - -@contextmanager -def create_browser( - plugin: str, - **config, -) -> Generator["AbstractBrowser", None, None]: - browser_plugins = load_browser_plugins() - if browser_module := browser_plugins.get(plugin): - with browser_module.create_browser(**config) as browser: - yield browser diff --git a/recap/browsers/abstract.py b/recap/browsers/abstract.py deleted file mode 100644 index 4a2e971a..00000000 --- a/recap/browsers/abstract.py +++ /dev/null @@ -1,41 +0,0 @@ -from abc import ABC, abstractmethod - -from recap.url import URL - - -class AbstractBrowser(ABC): - """ - The abstract class for all browsers. Recap uses a browser abstraction to - deal with different data infrastructure in a standard way. - - Browsers map infrastructure objects into a directory format. A different - browser is used for each type of infrastructure. - - A DatabaseBrowser might list directories like this: - - / - /some_db - /some_db/some_table - /some_db/some_other_table - """ - - url: URL - - @abstractmethod - def children(self, path: str) -> list[str] | None: - """ - Given a path, returns its children. Using the example above, - path=/some_db would return: - - [ - "some_table", - "some_other_table" - ] - - The path parameter is relative to the input path. - - :returns: List of child paths relative to input path. None if path - doesn't exist. - """ - - raise NotImplementedError diff --git a/recap/browsers/analyzing.py b/recap/browsers/analyzing.py deleted file mode 100644 index 80fddd8b..00000000 --- a/recap/browsers/analyzing.py +++ /dev/null @@ -1,134 +0,0 @@ -import logging -from contextlib import ExitStack, contextmanager -from typing import Any, Generator - -from recap.analyzers import create_analyzer -from recap.analyzers.abstract import AbstractAnalyzer -from recap.browsers import create_browser as create_wrapped_browser -from recap.browsers.abstract import AbstractBrowser -from recap.metadata import Metadata -from recap.plugins import load_analyzer_plugins, load_browser_plugins - -log = logging.getLogger(__name__) - - -class AnalyzingBrowser(AbstractBrowser): - """ - A special browser that ties an AbstractBrowser together with its compatible - AbstractAnalyzers. Browsers and analyzers are deliberately unaware of one - another, but something has to tie them together. That's this class. - - AnalyzingBrowser takes in a browser and its compatible analyzers. It - provides all of the same functionality as the wrapped browser, but it also - exposes an `analyze()` method, which runs all analyzers on a path and - returns the analyzed metadata. - - This browser is deliberately not added to Recap's browser plugin - entrypoint. It's more of a utility class for the CLI and crawler. 
- """ - - def __init__( - self, - browser: AbstractBrowser, - analyzers: list[AbstractAnalyzer], - ): - self.browser = browser - self.analyzers = analyzers - self.url = browser.url - - def children(self, path: str) -> list[str] | None: - return self.browser.children(path) - - def analyze(self, path: str) -> list[Metadata]: - results = [] - for analyzer in self.analyzers: - log.debug( - "Analyzing path=%s analyzer=%s", - path, - analyzer.__class__.__name__, - ) - try: # EAFP - if metadata := analyzer.analyze(path): - results.append(metadata) - except Exception as e: - log.debug( - "Unable to process path with analyzer path=%s analyzer=%s", - path, - analyzer.__class__.__name__, - exc_info=e, - ) - return results - - -@contextmanager -def create_browser(**config) -> Generator["AnalyzingBrowser", None, None]: - """ - Create an AnalyzingBrowser that wraps an AbstractBrowser and - AbstractAnalyzers that are compatible with the browser's child - CatalogPaths. - - Note that the `plugin` config is currently ignored. This method simply - searches for the first AbstractBrowser that doesn't throw an exception when - its `create_browser` method is called. - - :param config: A **kwargs config for the browser to wrap. - """ - - analyzer_plugins = load_analyzer_plugins() - browser_plugins = load_browser_plugins() - - with ExitStack() as stack: - url = config.get("url") - excludes = config.get("excludes", []) - browser = None - analyzers = [] - - # Find a real AbstractBrowser to wrap - for browser_name in browser_plugins.keys(): - try: - browser_context_manager = create_wrapped_browser( - plugin=browser_name, - **config, - ) - browser = stack.enter_context(browser_context_manager) - - # If we got this far, we found a browser. Stop looking. - break - except Exception as e: - log.debug( - "Skipped browser for url=%s name=%s", - url, - browser_name, - exc_info=e, - ) - - assert browser, f"Found no browser for url={url}" - - # Find analyzers compatible with the real AbstractBrowser's config. - for analyzer_name in analyzer_plugins.keys(): - if analyzer_name not in excludes: - try: - analyzer_context_manager = create_analyzer( - plugin=analyzer_name, - **config, - ) - analyzer = stack.enter_context( - analyzer_context_manager, - ) - analyzers.append(analyzer) - except Exception as e: - log.debug( - "Skipped analyzer for url=%s name=%s", - url, - analyzer_name, - exc_info=e, - ) - - if not analyzers: - log.warn( - "Found no metadata analyzers for url=%s. 
" - "Crawling will only create directories.", - url, - ) - - yield AnalyzingBrowser(browser, analyzers) diff --git a/recap/browsers/db.py b/recap/browsers/db.py deleted file mode 100644 index 75604b6f..00000000 --- a/recap/browsers/db.py +++ /dev/null @@ -1,178 +0,0 @@ -import logging -from contextlib import contextmanager -from functools import cached_property -from typing import Any, Callable, Generator - -import sqlalchemy - -from recap.url import URL - -from .abstract import AbstractBrowser - -log = logging.getLogger(__name__) - - -class DatabaseURL(URL): - def __init__( - self, - url: str, - subpath: str | None = None, - ): - super().__init__(url, subpath) - self.catalog: str | None = None - self.schema: str | None = None - self.table: str | None = None - parts = self.path_posix.parts if self.path_posix else [] - - match (self.dialect, self.host_port, parts): - case ("mysql", _, [*schema_table]): - self.catalog = "def" - if len(schema_table) > 1: - self.schema = schema_table[1] - if len(schema_table) > 2: - self.table = schema_table[2] - case ("bigquery", catalog, [*schema_table]): - self.catalog = catalog - if len(schema_table) > 1: - self.schema = schema_table[1] - if len(schema_table) > 2: - self.table = schema_table[2] - case (_, _, [*catalog_schema_table]): - if len(catalog_schema_table) > 1: - self.catalog = catalog_schema_table[1] - if len(catalog_schema_table) > 2: - self.schema = catalog_schema_table[2] - if len(catalog_schema_table) > 3: - self.table = catalog_schema_table[3] - case _: - raise ValueError(f"Invalid url={self.url}") - - -class DatabaseBrowser(AbstractBrowser): - """ - A browser that lists database objects. DatabaseBrowser uses SQLAlchemy and - its supported dialects (https://docs.sqlalchemy.org/en/14/dialects/). - - DatabaseBrowser follows this directory sturcture: - - /databases//instances//schemas//tables/ - /databases//instances//schemas//views/ - - The scheme is derived from the DB URL (e.g. - postgres://user:pass@localhost/my_db would have scheme="postgres"). The - instance defaults to the hostname, but can be overridden via the `name` - config. Schema is whatever SQLAlchemy returns from `get_schema_names`. - Ditto for table and view. - - This model does not make the distinction between schemas and databases as - defined in standard information_schema formats - (https://en.wikipedia.org/wiki/Information_schema). PostgreSQL, in - particular, is a little weird because it has both (the schema is usually - `public`). - """ - - def __init__( - self, - engine: sqlalchemy.engine.Engine, - ): - """ - :param engine: SQLAlchemy engine to use when browsing the db. - :param root_: The root CatalogPath that represents this DB instance. - """ - - self.engine = engine - self.url = URL(str(engine.url)) - - def children( - self, - path: str, - ) -> list[str] | None: - """ - :param path: Path to list. - :returns: List of children for path, or None if path doesn't exist. - """ - - url = DatabaseURL(f"{self.url}{path}") - - match url: - case DatabaseURL(catalog=_, schema=str(schema), table=None): - return sorted( - self.tables(schema) + self.views(schema), - key=str.casefold, - ) - case DatabaseURL(catalog=_, schema=None, table=None): - return self.schemas() - - def schemas(self) -> list[str]: - """ - :returns: All schema names in a database. In PostgreSQL, this is - usually just `public`. In MySQL and others, it's usually a list of - all database names. 
- """ - - return sqlalchemy.inspect(self.engine).get_schema_names() - - def tables(self, schema: str) -> list[str]: - """ - :returns: All table names in a schema. - """ - - return self._tables_or_views( - schema, - sqlalchemy.inspect(self.engine).get_table_names, - ) - - def views(self, schema: str) -> list[str]: - """ - :returns: All view names in a schema. - """ - return self._tables_or_views( - schema, - sqlalchemy.inspect(self.engine).get_view_names, - ) - - def _tables_or_views( - self, - schema: str, - get_method: Callable[[str], list[str]], - ) -> list[str]: - """ - Helper function that returns all tables or views for a given - schema. This method exists because some DBs return `schema.table` - and others just return `table
`. To keep things standard, we strip out - the `.` prefix if it exists. - - :param get_method: A SQLAlchemy inspection method; either - (`get_table_names` or `get_schema_names`). - :returns: All views or tables in a schema. - """ - - results = [] - try: - for table_or_view in get_method(schema): - # Stripe schema name from the table/view name. Some dialects - # include the schema name as part of the table/view. Let's keep - # things consistent. - if table_or_view.startswith(f"{schema}."): - table_or_view = table_or_view[len(schema) + 1 :] - results.append(table_or_view) - except Exception as e: - # Just optimistically try, and ignore if we can't get info. - # Easier than trying to figure out if permission exists. - log.debug( - "Unable to fetch tables or views for schema=%s", - schema, - exc_info=e, - ) - return results - - -@contextmanager -def create_browser( - url: str, - engine: dict[str, Any] = {}, - **_, -) -> Generator[DatabaseBrowser, None, None]: - yield DatabaseBrowser( - engine=sqlalchemy.create_engine(url, **engine), - ) diff --git a/recap/browsers/fs.py b/recap/browsers/fs.py deleted file mode 100644 index 77740ed9..00000000 --- a/recap/browsers/fs.py +++ /dev/null @@ -1,68 +0,0 @@ -import logging -from contextlib import contextmanager -from typing import Any, Generator - -from fsspec import AbstractFileSystem, get_fs_token_paths - -from recap.url import URL - -from .abstract import AbstractBrowser - -log = logging.getLogger(__name__) - - -class FilesystemBrowser(AbstractBrowser): - """ - A browser that lists filesystem objects. FilesystemBrowser uses fsspec and - its supported implementations - (https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations). - - FilesystemBrowser mirrors the directory structure in the filesystem. - """ - - def __init__( - self, - fs: AbstractFileSystem, - base_path: str, - ): - self.fs = fs - self.base_path = base_path - self.url = URL(f"{fs.protocol}://{base_path}") - - def children(self, path: str) -> list[str] | None: - absolute_path = self.base_path + path - if self.fs.exists(absolute_path): - if self.fs.isdir(absolute_path): - return [ - child["name"] - # Force detail=True because gcsfs doesn't honor defaults. - for child in self.fs.ls(absolute_path, detail=True) - ] - # Return empty since file exists but isn't a directory. - return [] - - -@contextmanager -def create_browser( - url: str, - storage_options: dict[str, Any] = {}, - **_, -) -> Generator[FilesystemBrowser, None, None]: - """ - :param url: The URL to use for the filesystem. If the URL contains a path, - the FilesystemBrowser will treat all paths relative to the URL path. - :param storage_options: Storage options **kwargs to pass on to the fsspec - filesystem constructor. - """ - - fs, _, paths = get_fs_token_paths(url, storage_options=storage_options) - - assert ( - len(paths) == 1 - ), f"Expected to get exactly 1 path from URL, but got paths={paths}" - - # Don't use DirFileSystem because it doesn't work properly with gcsfs. 
- yield FilesystemBrowser( - fs=fs, - base_path=paths[0], - ) diff --git a/recap/catalog.py b/recap/catalog.py new file mode 100644 index 00000000..ae361d7e --- /dev/null +++ b/recap/catalog.py @@ -0,0 +1,413 @@ +from __future__ import annotations + +from datetime import datetime +from enum import Enum +from urllib.parse import urlparse + +from recap.metadata import Schema +from recap.registry import FunctionRegistry +from recap.registry import registry as global_registry +from recap.storage.abstract import AbstractStorage, Direction, MetadataSubtype + + +def safe(url: str) -> str: + """ + Strips the `username:password` from a URL. + + Given: + + scheme://user:pass@host:port/some/path;param?some=query#fragment + + Returns: + + scheme://host:port/some/path;param?some=query#fragment + + :param url: URL to make safe. + :returns: A URL with the `username:password` removed. + """ + + parsed_url = urlparse(url) + safe_url = f"{parsed_url.scheme or 'file'}://" + safe_url += parsed_url.netloc.split("@", 1)[-1] + safe_url += parsed_url.path + if params := parsed_url.params: + safe_url += f";{params}" + if query := parsed_url.query: + safe_url += f"?{query}" + if fragment := parsed_url.fragment: + safe_url += f"#{fragment}" + return safe_url + + +class Relationship(Enum): + """ + A relationship between two URLs in the catalog. + """ + + CONTAINS = 1 + """ + The source URL contains the destination URL. This is used to model data + hierarchy. Folders contain files, databases contain schemas, and so on. + """ + + RUNS = 2 + """ + The source URL runs the destination URL. RUNS is usually used to mode job + executation: an account runs a job. + """ + + READS = 3 + """ + The source URL reads the destination URL. READS is used to model both + account access and job lineage. When the source URL is an account, READS + signals that the account has read-access to the destination URL. When the + source URL is a job, READS signals that the job reads the destination URL's + data. + """ + + WRITES = 4 + """ + The source URL writes to the destination URL. WRITES is used to model both + account access and job lineage. When the source URL is an account, WRITES + signals that the account has write-access to the destination URL. When the + source URL is a job, WRITES signals that the job writes to the destination + URL's data. + """ + + +class Catalog: + """ + Recap's catalog defines standard metadata and relationships between URLs. + The only metadata the catalog models is `recap.metadata.Schema`. + + Relationships are modeled as: + + * `account` runs `job` + * `account` reads `data` + * `account` writes `data` + * `job` reads `data` + * `job` writes `data` + * `data` contains `data` + + `data` contains `data` might seem odd at first, but it simply models how + object stores like S3 behave. Databases and tables are all treated as + `data`. + """ + + def __init__( + self, + storage: AbstractStorage, + registry: FunctionRegistry | None = None, + ): + self.storage = storage + self.registry = registry or global_registry + + def ls( + self, + url: str, + time: datetime | None = None, + refresh: bool = False, + **kwargs, + ) -> list[str] | None: + """ + Lists all URLs that the provided URL contains. In a filesystem, `ls` + would return a path's children. + + :param url: Fetch the contained children for this URL. + :param time: Time travel to a point in time. Causes the method to + return children from the last crawl before the provided timestamp. 
+ :param refresh: Bypass the catalog's cache, fetching the data directly + from the URL. Cache the results in the catalog. + :returns: A list of URLs contained by the input URL. None if the input + URL doesn't exist. + """ + + return self._links( + url, + Relationship.CONTAINS, + time, + refresh, + **kwargs, + ) + + def readers( + self, + url: str, + time: datetime | None = None, + refresh: bool = False, + **kwargs, + ) -> list[str] | None: + """ + Lists all URLs that read from the input URL. + + :param url: Fetch readers for this this URL. + :param time: Time travel to a point in time. Causes the method to + return readers from the last crawl before the provided timestamp. + :param refresh: Bypass the catalog's cache, fetching the data directly + from the URL. Cache the results in the catalog. + :returns: A list of URLs that read from the input URL. If the return + URL represents an account, then the account has read-access to the + input URL. If the return URL represents a job, then the account + reads the input URL's data. + """ + + return self._links( + url, + Relationship.READS, + time, + refresh, + Direction.TO, + **kwargs, + ) + + def reads( + self, + url: str, + time: datetime | None = None, + refresh: bool = False, + **kwargs, + ) -> list[str] | None: + """ + Lists all URLs that the input URL reads. + + :param url: Fetch URLs that this URL reads. + :param time: Time travel to a point in time. Causes the method to + return URLs from the last crawl before the provided timestamp. + :param refresh: Bypass the catalog's cache, fetching the data directly + from the URL. Cache the results in the catalog. + :returns: A list of URLs that the input URL has read from. If the input + URL is an account, the account will have read-access to the + returned URLs. If the input URL is a job, the job has read from + the return URLs. + """ + + return self._links( + url, + Relationship.READS, + time, + refresh, + Direction.FROM, + **kwargs, + ) + + def schema( + self, + url: str, + time: datetime | None = None, + refresh: bool = False, + **kwargs, + ) -> Schema | None: + """ + Returns a schema for the URL. + + :param url: URL to fetch schema for. + :param time: Time travel to a point in time. Causes the method to + return a scheam from the last crawl before the provided timestamp. + :param refresh: Bypass the catalog's cache, fetching the schema directly + from the URL. Cache the results in the catalog. + :returns: A schema for the URL or `None` if the URL doesn't exist. + """ + + if time and refresh: + raise ValueError("Unsupported: `refresh` and `time` are both set.") + safe_url = safe(url) + if not refresh and (schema := self.storage.metadata(safe_url, Schema, time)): + return schema + elif not time: + for params, callable in self.registry.metadata_registry.items(): + if match := params.pattern.match(url): + metadata = callable( + **params.method_args(url, **kwargs), + **match.groupdict(), + **kwargs, + ) + match metadata: + case Schema(): + self.storage.write(safe_url, metadata) + return metadata + + def search( + self, + query: str, + metadata_type: type[MetadataSubtype], + time: datetime | None = None, + ) -> list[MetadataSubtype]: + """ + Search for metadata in the catallog. + + :param query: Query to apply to the storage layer. This is usually a + `WHERE` clause, for example: + `"url='file:///tmp/recap-test/foo/foo.json'"` + or + `"json_extract(metadata_obj, '$.fields.name') = 'email'"` + :param metadata_type: The type of metadata to search. + :param time: Time travel to a point in time. 
Causes the method to + return URLs from the last crawl before the provided timestamp. + :returns: A list of Pydantic `BaseModel` metadata objects that match + the search query. + """ + + return self.storage.search(query, metadata_type, time) + + def writers( + self, + url: str, + time: datetime | None = None, + refresh: bool = False, + **kwargs, + ) -> list[str] | None: + """ + List all URLs that write to the input URL. + + :param url: URL that is written to. + :param time: Time travel to a point in time. Causes the method to + return URLs from the last crawl before the provided timestamp. + :param refresh: Bypass the catalog's cache, fetching the data directly + from the URL. Cache the results in the catalog. + :returns: A list of URLs that have written to the input URL. If the + return URL is an account, the account will have write-access to the + input URL. If the return URL is a job, the job has written to + the input URL. + """ + + return self._links( + url, + Relationship.WRITES, + time, + refresh, + Direction.TO, + **kwargs, + ) + + def writes( + self, + url: str, + time: datetime | None = None, + refresh: bool = False, + **kwargs, + ) -> list[str] | None: + """ + List all URLs that the input URL writes to. + + :param url: Fetch output URLs written to by this input URL. + :param time: Time travel to a point in time. Causes the method to + return URLs from the last crawl before the provided timestamp. + :param refresh: Bypass the catalog's cache, fetching the data directly + from the URL. Cache the results in the catalog. + :returns: A list of URLs that the input URL writes to. If the input URL + is an account, the account will have write-access to the return + URLs. If the input URL is a job, the job has written to the return + URLs. + """ + + return self._links( + url, + Relationship.WRITES, + time, + refresh, + Direction.FROM, + **kwargs, + ) + + def _links( + self, + url: str, + relationship: Relationship, + time: datetime | None = None, + refresh: bool = False, + direction: Direction = Direction.FROM, + **kwargs, + ) -> list[str] | None: + """ + A helper method that fetches links for a given URL. The method manages + fetching from the storage layer. If there's a cache miss (or `refresh` + is set), the method will try to use the function registry to fetch + links straignt from the URL. If new data is fetched, the resulting + links are merged into the storage layer (new links are added and old + links are removed). + + :param url: URL to fetch links for. + :param relationship: The type of relationship to fetch. + :param time: Time travel to a point in time. Causes the method to + return URLs from the last crawl before the provided timestamp. + :param refresh: Bypass the catalog's cache, fetching the data directly + from the URL. Cache the results in the catalog. + :param direction: Determines whether the URL is the source or + destination of the relationship. + :returns: A list of URLs that have a relationship with the input URL. 
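+
+ For example (illustrative of how the public methods delegate to this
+ helper), `Catalog.readers(url)` calls
+ `_links(url, Relationship.READS, direction=Direction.TO)`, while
+ `Catalog.reads(url)` calls it with `Direction.FROM`.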
+ """ + if time and refresh: + raise ValueError("Unsupported: `refresh` and `time` are both set.") + safe_url = safe(url) + storage_links = self.storage.links( + url=safe_url, + relationship=relationship.name.lower(), + time=time, + direction=direction, + ) + if not refresh and storage_links: + return storage_links + elif not time: + for params, callable in self.registry.relationship_registry.items(): + if ( + relationship.name.lower() == params.relationship + and direction == params.direction + and (match := params.pattern.match(url)) + ): + storage_links_set = set(storage_links) + relationship_type = relationship.name.lower() + links = set( + callable( + **params.method_args(url, **kwargs), + **match.groupdict(), + **kwargs, + ) + ) + # Add new links to storage + for link in links - storage_links_set: + url_from, url_to = safe_url, link + if direction == Direction.TO: + url_from, url_to = url_to, url_from + self.storage.link( + url_from, + relationship_type, + url_to, + ) + # Remove old links from storage + for deleted_link in storage_links_set - links: + url_from, url_to = safe_url, deleted_link + if direction == Direction.TO: + url_from, url_to = url_to, url_from + self.storage.unlink( + url_from, + relationship_type, + url_to, + ) + return list(links) + + +# Force load all recap-core integrations +# This has to be below `functions` to prevent a circular import. +# This can go away once all integrations are entry-point plugins. +import recap.integrations + + +def create_catalog( + url: str | None = None, + registry: FunctionRegistry | None = None, + **storage_opts, +) -> Catalog: + """ + A helper that creates a catalog and its underlying storage layer. + + :param url: URL for the storage layer. If unset, the storage layer uses its + default (A SQLite DB in `$RECAP_HOME/recap.db`). + :param registry: A function registry containting metadata and relationship + functions. If unset, the global registry is used. + :param storage_opts: Extra options passed through to the storage layer. + """ + + from recap.storage import create_storage + + storage = create_storage(url, **storage_opts) + return Catalog(storage, registry or global_registry) diff --git a/recap/catalogs/__init__.py b/recap/catalogs/__init__.py deleted file mode 100644 index 618d9f1d..00000000 --- a/recap/catalogs/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -Recap catalogs store metadata and expose read and search APIs. Recap ships with -a database catalog and Recap catalog implementation. The database catalog is -enabled by default (with SQLite). -""" - -from contextlib import contextmanager -from typing import Generator - -from recap.plugins import load_catalog_plugins - -from .abstract import AbstractCatalog - - -@contextmanager -def create_catalog( - plugin: str = "db", - **config, -) -> Generator["AbstractCatalog", None, None]: - catalog_plugins = load_catalog_plugins() - catalog_plugin_module = catalog_plugins.get(plugin) - assert ( - catalog_plugin_module - ), f"Unable to find catalog plugin module={catalog_plugin_module}" - with catalog_plugin_module.create_catalog(**config) as catalog: - yield catalog diff --git a/recap/catalogs/abstract.py b/recap/catalogs/abstract.py deleted file mode 100644 index 878369fc..00000000 --- a/recap/catalogs/abstract.py +++ /dev/null @@ -1,95 +0,0 @@ -from abc import ABC, abstractmethod -from datetime import datetime - -from recap.metadata import Metadata, MetadataSubtype - - -class AbstractCatalog(ABC): - """ - The abstract class for all catalogs. 
Recap catalogs store metadata and - expose read and search APIs. Catalogs follow the same directory structure - as AbstractBrowsers. - """ - - @abstractmethod - def add( - self, - url: str, - metadata: Metadata | None = None, - ): - """ - Add metadata at a URL. - """ - - raise NotImplementedError - - @abstractmethod - def read( - self, - url: str, - type: type[MetadataSubtype], - id: str | None = None, - time: datetime | None = None, - ) -> MetadataSubtype | None: - """ - Read metadata for a URL. - - :returns: Metadata dictionary of the format {"metadata_type": Any}. - """ - - raise NotImplementedError - - @abstractmethod - def children( - self, - url: str, - time: datetime | None = None, - ) -> list[str] | None: - raise NotImplementedError - - @abstractmethod - def all( - self, - url: str, - type: type[MetadataSubtype], - time: datetime | None = None, - ) -> list[MetadataSubtype] | None: - """ - Returns all children in a directory. This method does not signal - whether or not a directory has metadata, since metadata is not a child - of a directory. To check if a path has metadata, call `read`. - - :returns: A list of child names. Does not include absolute path. - """ - - raise NotImplementedError - - @abstractmethod - def remove( - self, - url: str, - type: type[Metadata] | None = None, - id: str | None = None, - ): - """ - Remove a directory or metadata entry. If type is note set, the whole - directory (including all children and metadata) is removed. - """ - - raise NotImplementedError - - @abstractmethod - def search( - self, - query: str, - type: type[MetadataSubtype], - time: datetime | None = None, - ) -> list[MetadataSubtype]: - """ - Searches an entire catalog for metadata. The query syntax is dependent - on the catalog implementation. - - :param query: A query string to match against metadata in a catalog. - """ - - raise NotImplementedError diff --git a/recap/catalogs/db.py b/recap/catalogs/db.py deleted file mode 100644 index 230cbf89..00000000 --- a/recap/catalogs/db.py +++ /dev/null @@ -1,446 +0,0 @@ -from contextlib import contextmanager -from datetime import datetime -from pathlib import Path, PurePosixPath -from typing import Any, Generator -from urllib.parse import urlparse - -from sqlalchemy import Column, DateTime, Index, create_engine, select, update -from sqlalchemy.dialects.postgresql import JSONB -from sqlalchemy.engine import Engine -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import sessionmaker -from sqlalchemy.schema import Sequence -from sqlalchemy.sql import func, text -from sqlalchemy.types import JSON, BigInteger, Integer, String - -from recap.config import RECAP_HOME, settings -from recap.metadata import Metadata, MetadataSubtype -from recap.url import URL - -from .abstract import AbstractCatalog - -DEFAULT_URL = f"sqlite:///{settings('root_path', RECAP_HOME)}/catalog/recap.db" -Base = declarative_base() - - -class PathEntry(Base): - __tablename__ = "paths" - - # Sequence instead of autoincrement="auto" for DuckDB compatibility - path_id_seq = Sequence("path_id_seq") - id = Column( - # Use Integer with SQLite since it's suggested by SQLalchemy - BigInteger().with_variant(Integer, "sqlite"), - path_id_seq, - primary_key=True, - ) - - # e.g. "/postgresql/localhost/some_db" - parent = Column(String(65535), nullable=False) - - # e.g. 
"some_table" - name = Column(String(255), nullable=False) - - created_at = Column( - DateTime, - nullable=False, - server_default=func.now(), - index=True, - ) - - deleted_at = Column(DateTime) - - __table_args__ = ( - Index( - "parent_name_idx", - parent, - name, - ), - ) - - def is_deleted(self) -> bool: - return self.deleted_at is not None - - -class MetadataEntry(Base): - __tablename__ = "metadata" - - # Sequence instead of autoincrement="auto" for DuckDB compatibility - metadata_id_seq = Sequence("metadata_id_seq") - id = Column( - # Use Integer with SQLite since it's suggested by SQLalchemy - BigInteger().with_variant(Integer, "sqlite"), - metadata_id_seq, - primary_key=True, - ) - - # e.g. "/postgresql/localhost/some_db/some_table" - path = Column(String(65535), nullable=False) - - # e.g. "schema", "lineage", "histogram" - metadata_type = Column(String(255), nullable=False) - - # e.g. None, "b3dccfde-0e9c-4585-a8bd-815b28e83101", "1234" - # None is used when type:metadata cardinality is 1:1. - # For examples, tables have only one schema. - metadata_id = Column(String(255)) - - # e.g. '{"fields": [{"name": "some_field", "type": "int32"}]}' - metadata_obj = Column( - JSON().with_variant(JSONB, "postgresql"), - nullable=False, - ) - - created_at = Column( - DateTime, - nullable=False, - server_default=func.now(), - index=True, - ) - - deleted_at = Column(DateTime) - - __table_args__ = ( - Index( - "path_type_id_idx", - path, - metadata_type, - metadata_id, - ), - ) - - def is_deleted(self) -> bool: - return self.deleted_at is not None - - -class DatabaseCatalog(AbstractCatalog): - """ - The database catalog uses [SQLAlchemy](https://www.sqlalchemy.org/) to - persists catalog data. By default, a SQLite database is used; the file is - located in `~/.recap/catalog/recap.db`. Search is implemented using - SQLite's [json_extract syntax](https://www.sqlite.org/json1.html#the_json_extract_function) - syntax. See [Recap CLI](cli.md) for an example. - - # Usage - - You can configure the SQLite catalog in your `settings.toml` like so: - - ```toml - [catalog] - url = "sqlite://" - engine.connect_args.check_same_thread = false - ``` - - Anything under the `engine` namespace will be forwarded to the SQLAlchemy - engine. - - You can use any - [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/14/dialects/) with the - database catalog. Here's a `settings.toml` that's configured for - PostgreSQL: - - ```toml - [catalog] - url = "postgresql://user:pass@localhost/some_db" - ``` - - # Implementation - - DatabaseCatalog stores metadata entries in a `catalog` table using - SQLAlchemy. The table has three main columns: parent, name, and metadata. - The parent and name columns reflect the directory for the metadata (as - defined by an AbstractBrowser). The metadata column contains a JSON blob of - all the various metadata types and objects. - - Previous metadata versions are kept in `catalog` as well. A `deleted_at` - field is used to tombstone deleted directories. Directories that were - updated, not deleted, will not have a `deleted_at` set; there will just be - a more recent row (as sorted by `id`). - - Reads return the most recent metadata that was written to the path. If the - most recent record has a `deleted_at` tombstone, a None is returned. - - Search strings are simply passed along to the WHERE clause in a SELECT - statement. This does leave room for SQL injection attacks; not thrilled - about that. 
- """ - - def __init__( - self, - engine: Engine, - ): - self.engine = engine - Base.metadata.create_all(engine) - self.Session = sessionmaker(engine) - - def _touch( - self, - path: PurePosixPath, - ): - path_stack = list(path.parts) - cwd = PurePosixPath(*path_stack) - - with self.Session() as session, session.begin(): - # Touch all parents to make sure they exist. - while len(path_stack): - maybe_row = session.scalar( - select( - PathEntry, - ) - .filter( - PathEntry.parent == str(cwd.parent), - PathEntry.name == str(cwd.name), - ) - .order_by( - PathEntry.id.desc(), - ) - ) - - if not maybe_row or maybe_row.is_deleted(): - session.add( - PathEntry( - parent=str(cwd.parent), - name=cwd.name, - ) - ) - else: - # Path exists and isn't deleted. We can assume all - # parents also exist, so no need to check. - break - - path_stack.pop() - cwd = PurePosixPath(*path_stack) - - def add( - self, - url: str, - metadata: Metadata | None = None, - ): - recap_url = URL(url) - path = recap_url.dialect_host_port_path - self._touch(path) - if metadata: - with self.Session() as session, session.begin(): - session.add( - MetadataEntry( - path=str(path), - metadata_type=metadata.key(), - metadata_id=metadata.id(), - metadata_obj=metadata.to_dict(), - ) - ) - - def remove( - self, - url: str, - type: type[Metadata] | None = None, - id: str | None = None, - ): - recap_url = URL(url) - path = recap_url.dialect_host_port_path - if type: - self._remove_metadata(path, type, id) - else: - self._remove_path(path) - - def _remove_path(self, path: PurePosixPath): - with self.Session() as session: - session.execute( - update(PathEntry) - .filter( - # Delete all direct descendants: parent = /foo/bar/baz - (PathEntry.parent == str(path)) - # Delete all indirect descendants: parent = /foo/bar/baz/% - | (PathEntry.parent.like(f"{path}/%")) - # Delete exact match: parent = /foo/bar and name = baz - | ( - (PathEntry.parent == str(path.parent)) - & (PathEntry.name == path.name) - ) - ) - .values(deleted_at=func.now()) - .execution_options(synchronize_session=False) - ) - - session.execute( - update(MetadataEntry) - .filter( - # Delete path metadata: parent = /foo/bar/baz - (MetadataEntry.path == str(path)) - # Delete all descendant metadata: parent = /foo/bar/baz/% - | (MetadataEntry.path.like(f"{path}/%")) - ) - .values(deleted_at=func.now()) - .execution_options(synchronize_session=False) - ) - - # Have to commit since synchronize_session=False. Have to set - # synchronize_session=False because BinaryExpression isn't - # supported in the filter otherwise. - session.commit() - - def _remove_metadata( - self, path: PurePosixPath, type: type[Metadata], id: str | None - ): - with self.Session() as session: - session.execute( - update(MetadataEntry) - .filter( - # Delete path metadata: parent = /foo/bar/baz - (MetadataEntry.path == str(path)) - # Delete all descendant metadata: parent = /foo/bar/baz/% - & (MetadataEntry.metadata_type == type.key()) - & ((MetadataEntry.metadata_id == id) if id is not None else True) - ) - .values(deleted_at=func.now()) - .execution_options(synchronize_session=False) - ) - - # Have to commit since synchronize_session=False. Have to set - # synchronize_session=False because BinaryExpression isn't - # supported in the filter otherwise. 
- session.commit() - - def children( - self, - url: str, - time: datetime | None = None, - ) -> list[str] | None: - recap_url = URL(url) - path = recap_url.dialect_host_port_path - with self.Session() as session: - subquery = ( - session.query( - PathEntry.name, - PathEntry.deleted_at, - func.rank() - .over( - order_by=PathEntry.id.desc(), - partition_by=( - PathEntry.parent, - PathEntry.name, - ), - ) - .label("rnk"), - ) - .filter( - PathEntry.parent == str(path), - PathEntry.created_at <= (time or func.now()), - ) - .subquery() - ) - query = session.query(subquery).filter( - subquery.c.rnk == 1, - subquery.c.deleted_at == None, - ) - rows = session.execute(query).fetchall() - if rows: - return [row[0] for row in rows] - else: - maybe_path = session.scalar( - select( - PathEntry, - ) - .where( - PathEntry.parent == str(path.parent), - PathEntry.name == path.name, - MetadataEntry.created_at <= (time or func.now()), - ) - .order_by( - MetadataEntry.id.desc(), - ) - ) - # Return an empty list of path exists or None if not. - return [] if maybe_path and not maybe_path.is_deleted() else None - - def read( - self, - url: str, - type: type[MetadataSubtype], - id: str | None = None, - time: datetime | None = None, - ) -> MetadataSubtype | None: - recap_url = URL(url) - path = recap_url.dialect_host_port_path - with self.Session() as session: - maybe_entry = session.scalar( - select( - MetadataEntry, - ) - .where( - MetadataEntry.path == str(path), - MetadataEntry.metadata_type == type.key(), - MetadataEntry.metadata_id == id if id else True, - MetadataEntry.created_at <= (time or func.now()), - ) - .order_by( - MetadataEntry.id.desc(), - ) - ) - if maybe_entry and not maybe_entry.is_deleted(): - return type.from_dict(maybe_entry.metadata_obj) - - def all( - self, - url: str, - type: type[MetadataSubtype], - time: datetime | None = None, - ) -> list[MetadataSubtype] | None: - recap_url = URL(url) - path = recap_url.dialect_host_port_path - return self.search(f"path = {path}", type, time) - - def search( - self, - query: str, - type: type[MetadataSubtype], - time: datetime | None = None, - ) -> list[MetadataSubtype]: - with self.Session() as session: - subquery = ( - session.query( - MetadataEntry.metadata_obj, - MetadataEntry.metadata_type, - MetadataEntry.deleted_at, - func.rank() - .over( - order_by=MetadataEntry.id.desc(), - partition_by=( - MetadataEntry.path, - MetadataEntry.metadata_type, - ), - ) - .label("rnk"), - ) - .filter( - MetadataEntry.metadata_type == type.key(), - MetadataEntry.created_at <= (time or func.now()), - # TODO Yikes. Pretty sure this is a SQL injection vulnerability. 
- text(query), - ) - .subquery() - ) - - query = session.query(subquery).filter( - subquery.c.rnk == 1, - subquery.c.deleted_at == None, - ) # pyright: ignore [reportGeneralTypeIssues] - - rows = session.execute(query).fetchall() - - return [type.from_dict(row[0]) for row in rows] - - -@contextmanager -def create_catalog( - url: str | None = None, - engine: dict[str, Any] = {}, - **_, -) -> Generator["DatabaseCatalog", None, None]: - if not url: - # If no URL is set, default to SQLite - url = DEFAULT_URL - # Make sure the catalog directory exists - db_path = urlparse(url).path # pyright: ignore [reportGeneralTypeIssues] - Path(db_path).parent.mkdir(parents=True, exist_ok=True) - yield DatabaseCatalog(create_engine(url, **engine)) diff --git a/recap/catalogs/recap.py b/recap/catalogs/recap.py deleted file mode 100644 index 3a909130..00000000 --- a/recap/catalogs/recap.py +++ /dev/null @@ -1,136 +0,0 @@ -from contextlib import contextmanager -from datetime import datetime -from typing import Generator - -import httpx - -from recap.metadata import Metadata, MetadataSubtype -from recap.server import DEFAULT_URL -from recap.url import URL - -from .abstract import AbstractCatalog - - -class RecapCatalog(AbstractCatalog): - """ - The Recap catalog makes HTTP requests to Recap's REST API. You can enable - RecapCatalog in your settings.toml with: - - ```toml - [catalog] - plugin = "recap" - url = "http://localhost:8000" - ``` - - The Recap catalog enables different systems to share the same metadata - when they all talk to the same Recap server. - """ - - def __init__( - self, - client: httpx.Client, - ): - self.client = client - - def add( - self, - url: str, - metadata: Metadata | None = None, - ): - encoded_url = URL(url).safe.encoded - if metadata: - response = self.client.put( - f"/catalog/{metadata.key()}/{encoded_url}", - json=metadata.to_dict(), - ) - else: - response = self.client.put(f"/catalog/urls/{encoded_url}") - response.raise_for_status() - - def read( - self, - url: str, - type: type[MetadataSubtype], - id: str | None = None, - time: datetime | None = None, - ) -> MetadataSubtype | None: - encoded_url = URL(url).safe.encoded - params = {} - if time: - params["time"] = time.isoformat() - if id: - params["id"] = id - response = self.client.get( - f"/catalog/{type.key()}/{encoded_url}", params=params - ) - if response.status_code == httpx.codes.OK: - return type.from_dict(response.json()) - if response.status_code == httpx.codes.NOT_FOUND: - return None - response.raise_for_status() - - def children( - self, - url: str, - time: datetime | None = None, - ) -> list[str] | None: - encoded_url = URL(url).safe.encoded - params = {} - if time: - params["time"] = time.isoformat() - response = self.client.get(f"/catalog/urls/{encoded_url}", params=params) - if response.status_code == httpx.codes.OK: - return response.json() - if response.status_code == httpx.codes.NOT_FOUND: - return None - response.raise_for_status() - - def all( - self, - url: str, - type: type[MetadataSubtype], - time: datetime | None = None, - ) -> list[MetadataSubtype] | None: - raise NotImplementedError - - def remove( - self, - url: str, - type: type[Metadata] | None = None, - id: str | None = None, - ): - encoded_url = URL(url).safe.encoded - if type: - params = {"id": id or None} - self.client.delete( - f"/catalog/{type.key()}/{encoded_url}", - params=params, - ).raise_for_status() - else: - self.client.delete(f"/catalog/urls/{encoded_url}").raise_for_status() - - def search( - self, - query: str, - type: 
type[MetadataSubtype], - time: datetime | None = None, - ) -> list[MetadataSubtype]: - params = { - "query": query, - } - if time: - params["time"] = time.isoformat() - response_list = self.client.get( - f"/catalog/{type.key()}", - params=params, - ).json() - return [type.from_dict(obj) for obj in response_list] - - -@contextmanager -def create_catalog( - url: str | None = None, - **_, -) -> Generator["RecapCatalog", None, None]: - with httpx.Client(base_url=url or DEFAULT_URL) as client: - yield RecapCatalog(client) diff --git a/recap/cli.py b/recap/cli.py index f69e29d7..adf6f143 100644 --- a/recap/cli.py +++ b/recap/cli.py @@ -1,21 +1,218 @@ +from __future__ import annotations + +from datetime import datetime + import typer +from rich import print_json + +from recap import repl +from recap.logging import setup_logging +from recap.metadata import Schema + +app = typer.Typer( + help=""" + Recap's command line interface. + """ +) + + +@app.command() +def crawl( + url: str, + args: list[str] = typer.Option( + [], + "--arg", + help="Arbitrary options (`--arg foo=bar`) passed to the crawler.", + ), +): + """ + Recursively crawl a URL and its children, storing metadata and + relationships in Recap's data catalog. + """ + + repl.crawl(url, **args_to_dict(args)) + + +@app.command() +def ls( + url: str, + args: list[str] = typer.Option( + [], + "--arg", + help="Arbitrary options (`--arg foo=bar`) passed to the catalog.", + ), + time: datetime = typer.Option( + None, + help="Time travel to see what a URL's children used to be.", + ), + refresh: bool = typer.Option( + False, + help="Skip Recap's catalog and read the latest data directly from the URL.", + ), +): + """ + List a URL's child URLs. + """ + + print_json(data=repl.ls(url, time, refresh, **args_to_dict(args))) + + +@app.command() +def readers( + url: str, + args: list[str] = typer.Option( + [], + "--arg", + help="Arbitrary options (`--arg foo=bar`) passed to the catalog.", + ), + time: datetime = typer.Option( + None, + help="Time travel to see what a URL's readers used to be.", + ), + refresh: bool = typer.Option( + False, + help="Skip Recap's catalog and read the latest data directly from the URL.", + ), +): + """ + See what reads from a URL. + """ + + print_json(data=repl.readers(url, time, refresh, **args_to_dict(args))) + + +@app.command() +def reads( + url: str, + args: list[str] = typer.Option( + [], + "--arg", + help="Arbitrary options (`--arg foo=bar`) passed to the catalog.", + ), + time: datetime = typer.Option( + None, + help="Time travel to see what a URL used to read.", + ), + refresh: bool = typer.Option( + False, + help="Skip Recap's catalog and read the latest data directly from the URL.", + ), +): + """ + See what a URL reads. URLs must be accounts or jobs. + """ + + print_json(data=repl.reads(url, time, refresh, **args_to_dict(args))) -from .logging import setup_logging -from .plugins import init_command_plugins -LOGGING_CONFIG = setup_logging() -app = init_command_plugins( - typer.Typer( - help=""" - Recap's command line interface. +@app.command() +def schema( + url: str, + args: list[str] = typer.Option( + [], + "--arg", + help="Arbitrary options (`--arg foo=bar`) passed to the catalog.", + ), + time: datetime = typer.Option( + None, + help="Time travel to see what a URL's schema used to be.", + ), + refresh: bool = typer.Option( + False, + help="Skip Recap's catalog and read the latest data directly from the URL.", + ), +): + """ + Get a Recap schema for a URL. + """ - \b - Recap's CLI is completely pluggable. 
See the commands below for more - information on individual command plugins. -""" + if schema := repl.schema(url, time, refresh, **args_to_dict(args)): + print_json(data=schema.dict()) + + +@app.command() +def search( + metadata_type: str, + query: str, + time: datetime = typer.Option( + None, + help="Time travel to see what a URL's metadata used to be.", + ), +): + if metadata_type == "schema": + print_json(data=[s.dict() for s in repl.search(query, Schema, time)]) + + +@app.command() +def writers( + url: str, + args: list[str] = typer.Option( + [], + "--arg", + help="Arbitrary options (`--arg foo=bar`) passed to the catalog.", + ), + time: datetime = typer.Option( + None, + help="Time travel to see what a URL's writers used to be.", + ), + refresh: bool = typer.Option( + False, + help="Skip Recap's catalog and read the latest data directly from the URL.", + ), +): + """ + See what writes to a URL. + """ + + print_json(data=repl.writers(url, time, refresh, **args_to_dict(args))) + + +@app.command() +def writes( + url: str, + args: list[str] = typer.Option( + [], + "--arg", + help="Arbitrary options (`--arg foo=bar`) passed to the catalog.", + ), + time: datetime = typer.Option( + None, + help="Time travel to see where a URL's used to write.", + ), + refresh: bool = typer.Option( + False, + help="Skip Recap's catalog and read the latest data directly from the URL.", + ), +): + """ + See what a URL writes. URLs must be accounts or jobs. + """ + + print_json(data=repl.writes(url, time, refresh, **args_to_dict(args))) + + +@app.command() +def serve(): + """ + Starts Recap's HTTP/JSON API server. + """ + + import uvicorn + + from recap.config import settings + from recap.server import fastapp + + uvicorn.run( + fastapp, + log_config=setup_logging(), + **settings.uvicorn_settings, ) -) + + +def args_to_dict(args: list[str]) -> dict[str, str]: + return dict([tuple(arg.split("=", 1)) for arg in args]) if __name__ == "__main__": + setup_logging() app() diff --git a/recap/commands/__init__.py b/recap/commands/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/recap/commands/catalog.py b/recap/commands/catalog.py deleted file mode 100644 index 15dfde98..00000000 --- a/recap/commands/catalog.py +++ /dev/null @@ -1,102 +0,0 @@ -from datetime import datetime - -import typer -from rich import print_json - -from recap import catalogs -from recap.config import settings -from recap.metadata import Metadata - -# TODO This is silly. These should be init'd in recap.__init__. -from recap.schemas.schema import Schema - -app = typer.Typer( - help=""" - Read and search the data catalog. - - Recap's `recap catalog` command reads metadata Recap's data catalog. List - the catalog's directory structure with `recap list`, read metadata from a - directory with `recap read`, and search with `recap search`. -""" -) - - -def type_from_string(type: str) -> type[Metadata] | None: - for cls in Metadata.__subclasses__(): - if cls.key() == type: - return cls - - -@app.command() -def search( - type_str: str, - query: str, - time: datetime = typer.Option( - None, - "--time", - "-t", - help="View metadata as of a point in time.", - ), -): - """ - Searches the data catalog. - - \b - Recap's search syntax depends on the catalog plugin that's used. Recap - stores its metadata in SQLite by default. 
You can use SQLite's json_extract - syntax to search the catalog: - - \b - recap catalog search "json_extract(metadata, '$.\"sqlalchemy.columns\".some_col') IS NOT NULL" - - \b - The database file defaults to `~/.recap/catalog/recap.db`, if you wish to - open a SQLite client directly. - """ - - with catalogs.create_catalog(**settings("catalog", {})) as c: - if type := type_from_string(type_str): - results = c.search(query, type, time) or [] - print_json( - data=[result.to_dict() for result in results], - ) - - -@app.command() -def children( - url: str = typer.Argument("/"), - time: datetime = typer.Option( - None, - "--time", - "-t", - help="List directory children for a path.", - ), -): - """ - Lists a data catalog directory's children. - """ - - with catalogs.create_catalog(**settings("catalog", {})) as c: - results = sorted(c.children(url, time) or []) - print_json(data=results) - - -@app.command() -def read( - type_str: str, - url: str, - time: datetime = typer.Option( - None, - "--time", - "-t", - help="View metadata as of a point in time.", - ), -): - """ - Prints metadata from a path in the data catalog. - """ - - with catalogs.create_catalog(**settings("catalog", {})) as c: - if type := type_from_string(type_str): - if metadata := c.read(url, type, time=time): - print_json(data=metadata.to_dict()) diff --git a/recap/commands/crawl.py b/recap/commands/crawl.py deleted file mode 100644 index ff3d8704..00000000 --- a/recap/commands/crawl.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Optional - -import typer -from rich.progress import Progress, SpinnerColumn, TextColumn - -from recap.catalogs import create_catalog -from recap.config import settings -from recap.crawler import create_crawler - -app = typer.Typer() - - -@app.command() -def crawl( - url: Optional[str] = typer.Argument( - None, - help="URL to crawl. If unset, all URLs in settings.toml are used.", - ), - excludes: list[str] = typer.Option( - [], - "--exclude", - "-e", - help="Skip the specified analyzer when crawling.", - ), - filters: list[str] = typer.Option( - [], - "--filter", - "-f", - help="Crawl only certain paths. Format is Unix shell-style wildcards.", - ), - recursive: bool = typer.Option( - True, - help="Crawl all subdirectories recursively.", - ), -): - """ - Crawls infrastructure and writes metadata to the data catalog. - - Use `recap crawl` to crawl infrastructure and store its metadata in Recap's - catalog. The `crawl` command takes an optional `URL` parameter. If - specified, the URL will be crawled. If not specified, all `crawlers` - defined in your `settings.toml` file will be crawled. - """ - - # Make sure URL is included in crawlers if it's passed in. This is needed - # because Recap works with URLs even if no config was set for it in - # settings.toml. It is smart enough to figure out the default module to - # use. 
- crawlers_configs = settings("crawlers", []) - crawler_urls = set( - map( - lambda c: c.get("url"), - crawlers_configs, - ) - ) - - if url and url not in crawler_urls: - crawlers_configs.append({"url": url}) - - if excludes: - for crawler_config in crawlers_configs: - if not url or url == crawler_config["url"]: - crawler_config["excludes"] = excludes - - if filters: - for crawler_config in crawlers_configs: - if not url or url == crawler_config["url"]: - crawler_config["filters"] = filters - - if recursive: - for crawler_config in crawlers_configs: - if not url or url == crawler_config["url"]: - crawler_config["recursive"] = recursive - - with create_catalog(**settings("catalog", {})) as catalog: - for crawler_config in crawlers_configs: - if not url or url == crawler_config["url"]: - with create_crawler(catalog=catalog, **crawler_config) as crawler: - spinner = SpinnerColumn(finished_text="[green]✓") - text = TextColumn("[progress.description]{task.description}") - - with Progress(spinner, text) as progress: - # Set up the spinner description. - task_id = progress.add_task( - description=f"Crawling {url} ...", - total=1, - ) - - crawler.crawl() - - # Mark done, so we get a little green checkmark. - progress.update(task_id, completed=1) diff --git a/recap/commands/plugins.py b/recap/commands/plugins.py deleted file mode 100644 index 9a30a004..00000000 --- a/recap/commands/plugins.py +++ /dev/null @@ -1,51 +0,0 @@ -import typer -from rich import print_json - -from recap import plugins - -app = typer.Typer(help="Lists available plugins.") - - -@app.command() -def analyzers(): - """ - Lists all analyzer plugins. - """ - - print_json(data=list(plugins.load_analyzer_plugins().keys())) - - -@app.command() -def browsers(): - """ - Lists all browser plugins. - """ - - print_json(data=list(plugins.load_browser_plugins().keys())) - - -@app.command() -def catalogs(): - """ - Lists all catalog plugins. - """ - - print_json(data=list(plugins.load_catalog_plugins().keys())) - - -@app.command() -def commands(): - """ - Lists all command plugins. - """ - - print_json(data=list(plugins.load_command_plugins().keys())) - - -@app.command() -def routers(): - """ - Lists all router plugins. - """ - - print_json(data=list(plugins.load_router_plugins().keys())) diff --git a/recap/commands/serve.py b/recap/commands/serve.py deleted file mode 100644 index 3742f4d0..00000000 --- a/recap/commands/serve.py +++ /dev/null @@ -1,39 +0,0 @@ -import typer - -from recap.config import settings -from recap.logging import setup_logging -from recap.server import fastapp - -app = typer.Typer() - - -@app.command() -def serve(): - """ - Starts Recap's HTTP/JSON API server. - - \b - You might wish to centralize your Recap data catalog in one place, and have - all users read from the same location. Recap ships with an HTTP/JSON server - for this use case. Start a Recap server on a single host using - `recap serve`. 
- - \b - Clients can then configure their Recap catalog in `settings.toml` to point - to the server location: - - \b - ```toml - [catalog] - type = "recap" - url = "http://192.168.0.1:8000" - ``` - """ - - import uvicorn - - uvicorn.run( - fastapp, - log_config=setup_logging(), - **settings("server.uvicorn", {}), - ) diff --git a/recap/config.py b/recap/config.py index b0f20ca0..9775af76 100644 --- a/recap/config.py +++ b/recap/config.py @@ -1,15 +1,35 @@ import os from pathlib import Path +from typing import Any -from dynaconf import Dynaconf +from pydantic import BaseModel, BaseSettings -RECAP_HOME = os.path.join(Path.home(), ".recap") +# Set up RECAP_HOME +DEFAULT_RECAP_HOME = os.path.join(Path.home(), ".recap") +RECAP_HOME = os.environ.get("RECAP_HOME", DEFAULT_RECAP_HOME) Path(RECAP_HOME).mkdir(parents=True, exist_ok=True) +# Set up RECAP_SECRETS_HOME +DEFAULT_SECRET_HOME = f"{RECAP_HOME}/.secrets" +SECRET_HOME = os.environ.get("RECAP_SECRETS_HOME", DEFAULT_SECRET_HOME) +Path(SECRET_HOME).mkdir(parents=True, exist_ok=True) -settings = Dynaconf( - envvar_prefix="RECAP", - load_dotenv=True, - root_path=RECAP_HOME, - settings_files=["settings.toml", ".secrets.toml"], -) + +class StorageSettings(BaseModel): + url: str = f"sqlite:///{RECAP_HOME}/recap.db" + opts: dict[str, str] = {} + + +class Settings(BaseSettings): + storage_settings: StorageSettings = StorageSettings() + logging_config_file: str | None = None + uvicorn_settings: dict[str, Any] = {} + + class Config: + env_file = ".env" + env_file_encoding = "utf-8" + env_nested_delimiter = "__" + env_prefix = "recap_" + + +settings = Settings(_secrets_dir=SECRET_HOME) # type: ignore diff --git a/recap/crawler.py b/recap/crawler.py index 5c0f87fb..d877173c 100644 --- a/recap/crawler.py +++ b/recap/crawler.py @@ -1,195 +1,88 @@ -import fnmatch -import logging -from contextlib import contextmanager -from pathlib import PurePosixPath -from typing import Any, Generator - -from recap.browsers.analyzing import AnalyzingBrowser, create_browser -from recap.catalogs.abstract import AbstractCatalog -from recap.url import URL - -log = logging.getLogger(__name__) +from recap.registry import FunctionRegistry +from recap.registry import registry as global_registry +from recap.storage.abstract import AbstractStorage, Direction class Crawler: - """ - Recap's crawler does three things: - - 1. Browses the configured infrastructure - 2. Analyzes the infrastructure's data to generate metadata - 3. Stores the metadata in Recap's data catalog - - # Behavior - - Recap's crawler is very simple right now. The crawler recursively browses - and analyzes all children starting from an infrastructure's root location. - - !!! note - - The meaning of an infrastructure's _root_ location depends on its type. - For a database, the _root_ usually denotes a database or catalog (to - use [_information_schema_](https://en.wikipedia.org/wiki/Information_schema) - terminology). For object stores, the _root_ is usually the bucket - location. - - # Scheduling - - Recap's crawler does not have a built in scheduler or orchestrator. You can - run crawls manually with `recap crawl`, or you can schedule `recap crawl` - to run periodically using [cron](https://en.wikipedia.org/wiki/Cron), - [Airflow](https://airflow.apache.org), [Prefect](https://prefect.io), - [Dagster](https://dagster.io/), [Modal](https://modal.com), or any other - scheduler. 
- """ - def __init__( self, - browser: AnalyzingBrowser, - catalog: AbstractCatalog, - recursive: bool = True, - filters: list[str] = [], - **_, + storage: AbstractStorage, + registry: FunctionRegistry, ): - """ - :param browser: AnalyzingBrowser to use for listing children and - analyzing metadata. - :param catalog: The catalog where the crawler will create directories - and store metadata. - :param recursive: Whether the crawler should recurse into - subdirectories when crawling. - :param filters: Path filter to include only certain paths. Recap uses - Unix filename pattern matching as defined in Python's fnmatch - module. Filtered paths are relative to the browser (excluding the - browser's root). - """ - - self.browser = browser - self.catalog = catalog - self.recursive = recursive - self.filters = filters - self.exploded_filters = self._explode_filters(filters) - - def crawl(self): - """ - Crawl a data system and persist discovered metadata in a catalog. - """ - - root = self.browser.url.safe - log.info("Beginning crawl root=%s", root) - path_stack: list[str] = ["/"] - - while len(path_stack) > 0: - relative_path = str(path_stack.pop()) - url = str(URL(self.browser.url.safe, relative_path.lstrip("/"))) - - log.info("Crawling path=%s", relative_path) - self.catalog.add(url) - - # 1. Read and save metadata for path if filters match. - if self._matches(relative_path, self.filters): - [ - self.catalog.add(url, metadata) - for metadata in self.browser.analyze(relative_path) - ] - - # 2. Add children (that match filter) to path_stack. - children = self.browser.children(relative_path) or [] - children = [f"{relative_path.rstrip('/')}/{child}" for child in children] - filtered_children = filter( - lambda p: self._matches(str(p), self.exploded_filters), - children, - ) - if self.recursive: - path_stack.extend(filtered_children) - - # 3. Remove deleted children from catalog. - self._remove_deleted(url, children) - - log.info("Finished crawl root=%s", root) - - def _matches( - self, - relative_path: str, - filters: list[str], - ) -> bool: - """ - Check if a path matches any filters. - - :returns: True if path matches a filter or if filters is empty. - """ - - for filter in filters: - if fnmatch.fnmatch(relative_path, filter): - return True - return False if filters else True - - def _remove_deleted( - self, - url: str, - browser_children: list[str], - ): - """ - Compares the path's children in the browser vs. what is currently in - the catalog. Deletes all children that appear in the catalog, but no - longer appear in the browser. This behavior removes children that used - to exist in data infrastructure, but have been deleted since the last - crawl. - """ - - catalog_children = self.catalog.children(url) or [] - # Find catalog children that are not in the browser's children. - deleted_children = [ - catalog_child - for catalog_child in catalog_children - if catalog_child not in browser_children - ] - for child in deleted_children: - url_to_remove = str(URL(url, child)) - log.debug("Removing deleted url from catalog: %s", url_to_remove) - self.catalog.remove(url_to_remove) - - def _explode_filters(self, filters: list[str]) -> list[str]: - """ - Returns a list of paths that bread-crumb from the filter all the way - back to root. 
For example: - - filters=[ - '/**/schemas/my_db/tables/foo*' - ] - returns=[ - '/**', - '/**/schemas', - '/**/schemas/my_db', - '/**/schemas/my_db/tables', - '/**/schemas/my_db/tables/foo*', - ] - - We need to do this so that parents match the filter and crawling - reaches the wild-carded children. - """ - - exploded_filters = [] - for filter in filters: - fragments = filter.split("/") - partial_path = PurePosixPath("/") - for fragment in fragments: - partial_path = PurePosixPath(partial_path, fragment) - exploded_filters.append(str(partial_path)) - return exploded_filters + self.storage = storage + self.registry = registry or global_registry + + def crawl(self, url: str, **kwargs): + url_stack = [url] + + while url_stack: + url_to_crawl = url_stack.pop() + + for params, callable in self.registry.metadata_registry.items(): + if match := params.pattern.match(url_to_crawl): + try: + metadata = callable( + **params.method_args(url, **kwargs), + **match.groupdict(), + **kwargs, + ) + self.storage.write(url_to_crawl, metadata) + except: + pass + + for params, callable in self.registry.relationship_registry.items(): + if match := params.pattern.match(url_to_crawl): + try: + links = set( + callable( + **params.method_args(url, **kwargs), + **match.groupdict(), + **kwargs, + ) + ) + storage_links = set( + self.storage.links( + url_to_crawl, + params.relationship, + direction=params.direction, + ) + ) + + # Add new links + for new_url in links - storage_links: + url_from = url_to_crawl + url_to = new_url + if params.direction == Direction.TO: + url_from, url_to = url_to, url_from + self.storage.link( + url_from, + params.relationship, + url_to, + ) + + # Delete old links + for deleted_url in storage_links - links: + url_from = url_to_crawl + url_to = deleted_url + if params.direction == Direction.TO: + url_from, url_to = url_to, url_from + self.storage.unlink(url_from, params.relationship, url_to) + + url_stack.extend(links) + except: + pass + + +# Force load all recap-core integrations +# This has to be below `functions` to prevent a circular import. +# This can go away once all integrations are entry-point plugins. +import recap.integrations -@contextmanager def create_crawler( - url: str, - catalog: AbstractCatalog, - **config, -) -> Generator["Crawler", None, None]: - """ - :param url: URL to crawl. - :param catalog: Catalog to persist metadata into. - :param config: **kwargs to pass to the `create_browser` call and Crawler - constructor. 
- """ + url: str | None = None, registry: FunctionRegistry | None = None, **storage_opts +) -> Crawler: + from recap.storage import create_storage - with create_browser(url=url, **config) as browser: - yield Crawler(browser, catalog, **config) + storage = create_storage(url, **storage_opts) + return Crawler(storage, registry or global_registry) diff --git a/recap/integrations/__init__.py b/recap/integrations/__init__.py new file mode 100644 index 00000000..c099e708 --- /dev/null +++ b/recap/integrations/__init__.py @@ -0,0 +1,4 @@ +# Load all integrations into the registry +import recap.integrations.bigquery +import recap.integrations.fsspec +import recap.integrations.sqlalchemy diff --git a/recap/integrations/bigquery.py b/recap/integrations/bigquery.py new file mode 100644 index 00000000..9308160d --- /dev/null +++ b/recap/integrations/bigquery.py @@ -0,0 +1,299 @@ +from __future__ import annotations + +from google.cloud.bigquery import Client +from sqlalchemy import inspect +from sqlalchemy.engine import Engine + +from recap.metadata import Field, Schema +from recap.registry import registry +from recap.storage.abstract import Direction + + +@registry.relationship("bigquery://{project}", "contains", include_engine=True) +@registry.relationship( + "bigquery://{project}/{dataset}", "contains", include_engine=True +) +def ls( + engine: Engine, + project: str, + dataset: str | None = None, + **_, +) -> list[str]: + """ + List all URLs contained by a project and (optionally) a dataset. If project + is set, returned URLs will be datasets: + `bigquery://some-project-1234/some_dataset` + If dataset is set, returned URLs will be tables: + `bigquery://some-project-1234/some_dataset/some_table` + + :param engine: SQLAlchemy Engine to use when inspecting schemas and tables. + :param project: A google cloud project ID (e.g. `some-project-1234`) + :param dataset: A dataset name. + :returns: A list of dataset or table URIs. + """ + + if dataset: + tables = inspect(engine).get_table_names(dataset) + views = inspect(engine).get_view_names(dataset) + return [ + # Remove schema prefix if it's there + f"bigquery://{project}/{dataset}/{table.split('.')[-1]}" + for table in tables + views + ] + else: + return [ + f"bigquery://{project}/{dataset}" + for dataset in inspect(engine).get_schema_names() + ] + + +@registry.relationship( + "bigquery://{project}/{dataset}/{table}", "reads", Direction.TO, include_engine=True +) +def readers( + engine: Engine, + project: str, + dataset: str, + table: str, + **client_args, +) -> list[str]: + """ + Returns all accounts and jobs that read from a BigQuery table. Return URLs + will be of the form: + + bigquery://some-project?account=user:some@email.com + bigquery://some-project?job=bquxjob123456 + + A reader account signals read-access. A reader job signals that the job + read from the dataset. + + :param engine: SQLAlchemy Engine to use when inspecting schemas and tables. + :param project: A google cloud project ID (e.g. `some-project-1234`) + :param dataset: A dataset name. + :param table: A table name. + :param args: Extra arguments passed to a `google.cloud.bigquery.Client`. + :returns: A list of dataset or table URIs. 
+ """ + + return readers_accounts(project, dataset, table, **client_args) + readers_jobs( + engine, project, dataset, table + ) + + +def readers_accounts( + project: str, + dataset: str, + table: str, + **client_args, +) -> list[str]: + client = Client(project, **client_args) + if policy := client.get_iam_policy(f"{dataset}.{table}"): + for binding in policy.bindings: + if is_reader(binding.get("role", "")): + return [ + f"bigquery://{project}?account={member}" + for member in binding.get("members", []) + ] + return [] + + +def readers_jobs( + engine: Engine, + project: str, + dataset: str, + table: str, +) -> list[str]: + # TODO Some day, maybe data plex's lineage API will work for this. + from sqlalchemy.sql import text + + # TODO Should get region for dataset, and use that in query. + with engine.connect() as connection: + results = connection.execute( + text( + """ + SELECT + job_id, + query, + destination_table, + FROM + `region-us`.INFORMATION_SCHEMA.JOBS, + UNNEST(referenced_tables) AS referenced_table + WHERE + referenced_table.project_id = :project_id + AND referenced_table.dataset_id = :dataset_id + AND referenced_table.table_id = :table_id + AND state = 'DONE' + """ + ), + project_id=project, + dataset_id=dataset, + table_id=table, + ) + + return [f"bigquery://{project}?job={row[0]}" for row in results] + + +@registry.metadata("bigquery://{project}/{dataset}/{table}", include_engine=True) +def schema( + engine: Engine, + dataset: str, + table: str, + **_, +) -> Schema: + """ + Fetch a schema from a BigQuery table. + + :param engine: SQLAlchemy Engine to use when inspecting schemas and tables. + :param project: A google cloud project ID (e.g. `some-project-1234`) + :param dataset: A dataset name. + :param table: A table name. + :returns: A Recap schema. + """ + + columns = inspect(engine).get_columns( + table, + dataset, + ) + return Schema( + fields=[ + Field( + name=column["name"], + type=str(column["type"]), + default=column["default"], + nullable=column["nullable"], + comment=column.get("comment"), + ) + for column in columns + ], + ) + + +@registry.relationship( + "bigquery://{project}/{dataset}/{table}", + "writes", + Direction.TO, + include_engine=True, +) +def writers( + engine: Engine, + project: str, + dataset: str, + table: str, + **client_args, +) -> list[str]: + """ + Returns all accounts and jobs that write to a BigQuery table. Return URLs + will be of the form: + + bigquery://some-project?account=user:some@email.com + bigquery://some-project?job=bquxjob123456 + + A writer account signals write-access. A writer job signals that the job + wrote to the dataset. + + :param engine: SQLAlchemy Engine to use when inspecting schemas and tables. + :param project: A google cloud project ID (e.g. `some-project-1234`) + :param dataset: A dataset name. + :param table: A table name. + :param client_args: Extra arguments passed to a + `google.cloud.bigquery.Client`. + :returns: A list of dataset or table URIs. 
+ """ + + return writers_accounts(project, dataset, table, **client_args) + writers_jobs( + engine, project, dataset, table + ) + + +def writers_accounts( + project: str, + dataset: str, + table: str, + **client_args, +) -> list[str]: + client = Client(project, **client_args) + if policy := client.get_iam_policy(f"{dataset}.{table}"): + for binding in policy.bindings: + if is_writer(binding.get("role", "")): + return [ + f"bigquery://{project}?account={member}" + for member in binding.get("members", []) + ] + return [] + + +def writers_jobs( + engine: Engine, + project: str, + dataset: str, + table: str, +) -> list[str]: + # TODO Some day, maybe dataplex's lineage API will work for this. + from sqlalchemy.sql import text + + # TODO Should get region for dataset, and use that in query. + with engine.connect() as connection: + results = connection.execute( + text( + """ + SELECT + job_id, + FROM + `region-us`.INFORMATION_SCHEMA.JOBS, + UNNEST(referenced_tables) + WHERE + destination_table.project_id = :project_id + AND destination_table.dataset_id = :dataset_id + AND destination_table.table_id = :table_id + AND state = 'DONE' + """ + ), + project_id=project, + dataset_id=dataset, + table_id=table, + ) + + return [f"bigquery://{project}?job={row[0]}" for row in results] + + +""" +NOTE: This is certainly not accurate. The correct way to do this is to query +the permissions for a given role, and see if they have the +`bigquery.tables.read` permission, and so on. Unfortunately, Google's IAM API +is a bit of a mess right now, and this query doesn't appear possible in their +V2 client. + +For now, I'm just supporting the pre-defined roles. +""" + +READERS = set( + [ + "roles/owner", + "roles/editor", + "roles/viewer", + "roles/bigquery.admin", + "roles/bigquery.dataEditor", + "roles/bigquery.dataOwner", + "roles/bigquery.dataViewer", + "roles/bigquery.filteredDataViewer", + "roles/bigquerydatapolicy.maskedReader", + ] +) + +WRITERS = set( + [ + "roles/owner", + "roles/editor", + "roles/bigquery.admin", + "roles/bigquery.dataEditor", + "roles/bigquery.dataOwner", + ] +) + + +def is_reader(role: str) -> bool: + return role in READERS + + +def is_writer(role: str) -> bool: + return role in WRITERS diff --git a/recap/integrations/fsspec.py b/recap/integrations/fsspec.py new file mode 100644 index 00000000..c30ad96e --- /dev/null +++ b/recap/integrations/fsspec.py @@ -0,0 +1,93 @@ +from pathlib import PurePosixPath +from urllib.parse import urlparse + +from frictionless import Resource, describe # type: ignore +from fsspec import AbstractFileSystem + +from recap.metadata import Field, Schema +from recap.registry import registry + + +@registry.relationship( + "s3://{path:path}", "contains", include_fs=True, include_url=True +) +@registry.relationship( + "gs://{path:path}", "contains", include_fs=True, include_url=True +) +@registry.relationship( + "file:///{path:path}", "contains", include_fs=True, include_url=True +) +@registry.relationship("/{path:path}", "contains", include_fs=True, include_url=True) +def ls( + url: str, + fs: AbstractFileSystem, + path: str | None = None, +) -> list[str]: + """ + List all children in a filesystem path. Recap treats all filesystem paths + as objects (similar to S3), so each URL might contain data and/or child + URLs. + + :param url: The fully matched URL when using the function registry. + :param fs: A `fsspec` filesystem. + :param path: Filesystem path. + :returns: A list of child URLs. 
+    """
+
+    scheme = urlparse(url).scheme
+    # Force a "file" scheme because frictionless is picky.
+    if scheme == "":
+        scheme = "file"
+    if scheme == "file":
+        path = f"/{path}"
+    return [
+        f"{scheme}://{child['name']}"
+        # Force detail=True because gcsfs doesn't honor defaults.
+        for child in fs.ls(path or "", detail=True)
+    ]
+
+
+@registry.metadata("s3://{bucket}/{path:path}", include_url=True)
+@registry.metadata("file:///{path:path}", include_url=True)
+@registry.metadata("/{path:path}", include_url=True)
+def schema(
+    url: str,
+    path: str,
+    **_,
+) -> Schema:
+    """
+    Fetch a Recap schema for a URL. This method supports S3 and local
+    filesystems, and CSV, TSV, Parquet, and JSON filetypes.
+
+    Recap uses `frictionless` for schema inference.
+
+    :param url: The fully matched URL when using the function registry.
+    :param path: Path to a CSV, TSV, Parquet, or JSON file.
+    """
+
+    path_posix = PurePosixPath(path)
+    resource = None
+
+    # Frictionless is picky about local file paths.
+    if urlparse(url).scheme == "":
+        url = f"file://{url}"
+
+    match path_posix.suffix:
+        case (".csv" | ".tsv" | ".parquet"):
+            resource = describe(url)
+        case (".json" | ".ndjson" | ".jsonl"):
+            resource = describe(path=url, format="ndjson")
+
+    if isinstance(resource, Resource):
+        return Schema(
+            fields=[
+                Field(
+                    name=field.name,
+                    type=field.type,
+                )
+                for field in resource.schema.fields  # pyright: ignore [reportOptionalMemberAccess]
+                if field.name
+            ],
+        )
+
+    raise ValueError(f"Unsupported url={url}")
diff --git a/recap/integrations/sqlalchemy.py b/recap/integrations/sqlalchemy.py
new file mode 100644
index 00000000..28d4a78e
--- /dev/null
+++ b/recap/integrations/sqlalchemy.py
@@ -0,0 +1,105 @@
+"""
+This module uses INFORMATION_SCHEMA's terminology and hierarchy, which
+models database hierarchy as:
+
+    catalog -> schema -> table -> column
+
+If you're using PostgreSQL, "schema" is usually "public". If you're using
+MySQL, ["catalog" is always hard-coded to "def"](https://dev.mysql.com/doc/mysql-infoschema-excerpt/8.0/en/information-schema-columns-table.html),
+and usually excluded from MySQL connect URLs. In BigQuery, "catalog" is the
+project and "schema" is the dataset.
+"""
+
+from __future__ import annotations
+
+from sqlalchemy import inspect
+from sqlalchemy.engine import Engine
+
+from recap.metadata import Field, Schema
+from recap.registry import registry
+
+
+@registry.metadata(
+    "postgresql://{netloc}/{database}/{schema}/{table}", include_engine=True
+)
+@registry.metadata(
+    "snowflake://{netloc}/{database}/{schema}/{table}", include_engine=True
+)
+def schema(
+    engine: Engine,
+    schema: str,
+    table: str,
+    **_,
+) -> Schema:
+    """
+    Fetch a Recap schema for a SQL table.
+
+    :param engine: SQLAlchemy Engine to use when inspecting schemas and tables.
+    :param schema: A database schema.
+    :param table: A table name.
+    :returns: A Recap schema.
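As a rough usage sketch (connection details and the table are hypothetical), the function can be called directly with a SQLAlchemy engine:

```python
# Illustrative only; assumes a reachable PostgreSQL database with a
# public.users table.
from sqlalchemy import create_engine

from recap.integrations.sqlalchemy import schema

engine = create_engine("postgresql://localhost/some_db")
users = schema(engine, schema="public", table="users")

for field in users.fields:
    print(field.name, field.type, field.nullable)
```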
+ """ + + columns = inspect(engine).get_columns( + table, + schema, + ) + return Schema( + fields=[ + Field( + name=column["name"], + type=str(column["type"]), + default=column["default"], + nullable=column["nullable"], + comment=column.get("comment"), + ) + for column in columns + ], + ) + + +@registry.relationship( + "postgresql://{netloc}/{database}", "contains", include_engine=True +) +@registry.relationship( + "postgresql://{netloc}/{database}/{schema}", "contains", include_engine=True +) +@registry.relationship( + "snowflake://{netloc}/{database}", "contains", include_engine=True +) +@registry.relationship( + "snowflake://{netloc}/{database}/{schema}", "contains", include_engine=True +) +def ls( + engine: Engine, + schema: str | None = None, + **_, +) -> list[str]: + """ + Fetch (INFORMATION_SCHEMA) schemas or tables from a database. + + URLs are of the form: + + postgresql://{netloc}/{database}/{schema}/{table} + snowflake://{netloc}/{database}/{schema}/{table} + mysql://{netloc}/{schema}/{table} + bigquery://{project}/{dataset}/{table} + + :param engine: SQLAlchemy Engine to use when inspecting schemas and tables. + :param schema: A database schema. + :param table: A table name. + :returns: A Recap schema. + """ + + if schema: + tables = inspect(engine).get_table_names(schema) + views = inspect(engine).get_view_names(schema) + return [ + # Remove schema prefix if it's there + f"{engine.url}/{schema}/{table.split('.')[-1]}" + for table in tables + views + ] + else: + return [ + f"{engine.url}/{schema}" for schema in inspect(engine).get_schema_names() + ] diff --git a/recap/logging.py b/recap/logging.py index 030ccbcd..71fd4740 100644 --- a/recap/logging.py +++ b/recap/logging.py @@ -4,10 +4,7 @@ import tomli -from .config import settings - -# Config key to define a custom TOML dictConfig location. -LOGGING_CONFIG_PATH = "logging.config.path" +from recap.config import settings # Default logging config if no logging config path is set. DEFAULT_LOGGING_CONFIG = { @@ -45,9 +42,9 @@ def setup_logging() -> dict[str, Any]: """ logging_config = DEFAULT_LOGGING_CONFIG - logging_config_file_loc = settings(LOGGING_CONFIG_PATH) - if logging_config_file_loc: - logging_config_file_path = Path(logging_config_file_loc) + logging_config_file = settings.logging_config_file + if logging_config_file: + logging_config_file_path = Path(logging_config_file) logging_config_string = logging_config_file_path.read_text() logging_config = tomli.loads(logging_config_string) logging.config.dictConfig(logging_config) diff --git a/recap/metadata.py b/recap/metadata.py index 5913ec5d..4136bec2 100644 --- a/recap/metadata.py +++ b/recap/metadata.py @@ -1,42 +1,49 @@ -from __future__ import annotations +""" +This module contains the core metadata models that Recap understands. All +models extend Pydantic's `BaseModel` class. + +Right now, Recap's only metadata model is a Schema. Other entities, such as +accounts and jobs, are represented by URLs, but have no associated metadata. +""" -from abc import ABC, abstractmethod -from dataclasses import asdict, dataclass -from json import dumps, loads -from typing import Any, TypeVar +from __future__ import annotations -import dacite +from pydantic import BaseModel -@dataclass -class Metadata(ABC): - def id(self) -> str | None: - return None +class Field(BaseModel): + name: str + """ + The name of a field. + """ - @classmethod - @abstractmethod - def key(cls) -> str: - raise NotImplementedError + type: str | None = None + """ + A field's type. 
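Since these are plain Pydantic models, constructing and serializing them is straightforward. A small sketch (the field names are made up):

```python
from recap.metadata import Field, Schema

schema = Schema(
    fields=[
        Field(name="id", type="BIGINT", nullable=False),
        Field(name="email", type="VARCHAR", nullable=True, comment="User email"),
    ]
)

# Pydantic handles dict/JSON round-tripping, which is how the storage layer
# persists and restores metadata.
assert Schema.parse_obj(schema.dict()) == schema
print(schema.json())
```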
+ """ - def to_dict(self) -> dict[str, Any]: - return asdict(self) + default: str | None = None + """ + A field's default value (represented as a string). + """ - def to_json(self) -> str: - return dumps(asdict(self)) + nullable: bool | None = None + """ + Whether the field is nullable or not. If `False`, the field is required. + """ - @classmethod - def from_dict( - cls: type[MetadataSubtype], - obj: dict[str, Any], - ) -> MetadataSubtype: - return dacite.from_dict(data_class=cls, data=obj) + comment: str | None = None + """ + A documentation comment for the field. + """ - @classmethod - def from_json( - cls: type[MetadataSubtype], - json_str: str, - ) -> MetadataSubtype: - return cls.from_dict(loads(json_str)) +class Schema(BaseModel): + """ + Recap's representation of a Schema. + """ -MetadataSubtype = TypeVar("MetadataSubtype", bound=Metadata) + fields: list[Field] + """ + Fields in the schema. + """ diff --git a/recap/plugins.py b/recap/plugins.py deleted file mode 100644 index 525b646a..00000000 --- a/recap/plugins.py +++ /dev/null @@ -1,126 +0,0 @@ -import logging -import sys -from types import ModuleType - -import typer -from fastapi import APIRouter - -log = logging.getLogger(__name__) - - -if sys.version_info < (3, 10): - from importlib_metadata import entry_points -else: - from importlib.metadata import entry_points - - -ANALYZER_PLUGIN_GROUP = "recap.analyzers" -BROWSER_PLUGIN_GROUP = "recap.browsers" -CATALOG_PLUGIN_GROUP = "recap.catalogs" -COMMAND_PLUGIN_GROUP = "recap.commands" -ROUTER_PLUGIN_GROUP = "recap.routers" - - -def load_analyzer_plugins() -> dict[str, ModuleType]: - plugins = {} - analyzer_plugins = entry_points(group=ANALYZER_PLUGIN_GROUP) - for analyzer_plugin in analyzer_plugins: - analyzer_plugin_name = analyzer_plugin.name - try: - analyzer_plugin_class = analyzer_plugin.load() - plugins[analyzer_plugin_name] = analyzer_plugin_class - except ImportError as e: - log.debug( - "Skipping analyzer=%s due to import error.", - analyzer_plugin_name, - exc_info=e, - ) - return plugins - - -def load_browser_plugins() -> dict[str, ModuleType]: - plugins = {} - browser_plugins = entry_points(group=BROWSER_PLUGIN_GROUP) - for browser_plugin in browser_plugins: - browser_plugin_name = browser_plugin.name - try: - browser_plugin_class = browser_plugin.load() - plugins[browser_plugin_name] = browser_plugin_class - except ImportError as e: - log.debug( - "Skipping browser=%s due to import error.", - browser_plugin_name, - exc_info=e, - ) - return plugins - - -def load_catalog_plugins() -> dict[str, ModuleType]: - plugins = {} - catalog_plugins = entry_points(group=CATALOG_PLUGIN_GROUP) - for catalog_plugin in catalog_plugins: - catalog_plugin_name = catalog_plugin.name - try: - catalog_plugin_class = catalog_plugin.load() - plugins[catalog_plugin_name] = catalog_plugin_class - except ImportError as e: - log.debug( - "Skipping catalog=%s due to import error.", - catalog_plugin_name, - exc_info=e, - ) - return plugins - - -def load_command_plugins() -> dict[str, typer.Typer]: - plugins = {} - command_plugins = entry_points(group=COMMAND_PLUGIN_GROUP) - for command_plugin in command_plugins: - command_plugin_name = command_plugin.name - try: - command_plugin_instance = command_plugin.load() - plugins[command_plugin_name] = command_plugin_instance - except ImportError as e: - log.debug( - "Skipping command=%s due to import error.", - command_plugin_name, - exc_info=e, - ) - return plugins - - -def load_router_plugins() -> dict[str, APIRouter]: - plugins = {} - router_plugins = 
entry_points(group=ROUTER_PLUGIN_GROUP) - for router_plugin in router_plugins: - router_plugin_name = router_plugin.name - try: - router_plugin_instance = router_plugin.load() - plugins[router_plugin_name] = router_plugin_instance - except ImportError as e: - log.debug( - "Skipping router=%s due to import error.", - router_plugin_name, - exc_info=e, - ) - return plugins - - -def init_command_plugins(app: typer.Typer) -> typer.Typer: - plugins = load_command_plugins() - - for command_plugin_name, command_plugin in plugins.items(): - # If the plugin has a single command, then put it directly into the - # current Typer app. If the plugin has multiple commands, then - # treat it as a command group, and add it as such. - # TODO Shouldn't need to do this, but Typer has a bug. - # https://github.com/tiangolo/typer/issues/119 - if len(command_plugin.registered_commands) == 1: - callback = command_plugin.registered_commands[0].callback - app.command(command_plugin_name)( - callback # pyright: ignore [reportGeneralTypeIssues] - ) - else: - app.add_typer(command_plugin, name=command_plugin_name) - - return app diff --git a/recap/registry.py b/recap/registry.py new file mode 100644 index 00000000..485e3183 --- /dev/null +++ b/recap/registry.py @@ -0,0 +1,147 @@ +from dataclasses import dataclass +from pathlib import PurePath +from re import Pattern +from typing import Any, Callable +from urllib.parse import urlparse + +from fsspec import AbstractFileSystem, get_fs_token_paths +from pandas import DataFrame, read_csv, read_json, read_parquet, read_sql_table +from pydantic import BaseModel +from sqlalchemy import create_engine +from sqlalchemy.engine import Engine +from starlette.routing import compile_path + +from recap.storage.abstract import Direction + + +@dataclass(frozen=True, kw_only=True) +class MethodArgsParams: + pattern: Pattern + include_engine: bool = False + include_fs: bool = False + include_url: bool = False + + def method_args(self, url: str, **kwargs) -> dict[str, Any]: + method_args = {} + if self.include_engine: + method_args["engine"] = self.engine(url, **kwargs) + if self.include_fs: + method_args["fs"] = self.fs(url, **kwargs) + if self.include_url: + method_args["url"] = url + return method_args + + def engine(self, url: str, **engine_args) -> Engine: + parsed_url = urlparse(url) + match parsed_url.scheme, parsed_url.netloc, parsed_url.path.split("/"): + case "bigquery", project, _: + return create_engine(f"bigquery://{project}", **engine_args) + case str(scheme), str(netloc), path: + connect_url = f"{scheme}://{netloc}" + if len(path) > 1: + connect_url += f"/{path[1]}" + return create_engine(connect_url, **engine_args) + raise ValueError(f"Unable to create engine for url={url}") + + def fs(self, url: str, **storage_options) -> AbstractFileSystem: + fs, _, _ = get_fs_token_paths(url, storage_options=storage_options) + return fs + + +@dataclass(frozen=True) +class MetadataParams(MethodArgsParams): + include_df: bool = False + + def method_args(self, url: str, **kwargs) -> dict[str, Any]: + method_args = super().method_args(url, **kwargs) + if self.include_df: + method_args["df"] = self.df(url, **kwargs) + return method_args + + def df(self, url: str, **kwargs) -> DataFrame: + try: + engine = self.engine(url, **kwargs) + with engine.connect() as connection: + table = PurePath(url).parts[-1] + read_sql_table(table, connection) + except: + pass + try: + match PurePath(urlparse(url).path).suffix: + case ".json" | ".jsonl" | ".ndjson": + return read_json(url, **kwargs) + case ".csv" | 
".tsv": + return read_csv(url, **kwargs) + case ".parquet": + return read_parquet(url, **kwargs) + except: + pass + raise ValueError(f"Unable to create data frame for url={url}") + + +@dataclass(frozen=True) +class RelationshipParams(MethodArgsParams): + relationship: str + direction: Direction = Direction.FROM + + +class FunctionRegistry: + def __init__(self): + self.relationship_registry: dict[ + RelationshipParams, Callable[..., list[str]] + ] = {} + self.metadata_registry: dict[MetadataParams, Callable[..., BaseModel]] = {} + + def metadata( + self, + pattern: str, + include_df: bool = False, + include_engine: bool = False, + include_fs: bool = False, + include_url: bool = False, + ): + def inner( + callable: Callable[..., BaseModel], + ) -> Callable[..., BaseModel]: + pattern_regex, _, _ = compile_path(pattern) + params = MetadataParams( + pattern=pattern_regex, + include_df=include_df, + include_fs=include_fs, + include_engine=include_engine, + include_url=include_url, + ) + self.metadata_registry[params] = callable + return callable + + return inner + + def relationship( + self, + pattern: str, + type: str, + direction: Direction = Direction.FROM, + include_engine: bool = False, + include_fs: bool = False, + include_url: bool = False, + ): + pattern_regex, _, _ = compile_path(pattern) + params = RelationshipParams( + pattern=pattern_regex, + relationship=type, + direction=direction, + include_fs=include_fs, + include_engine=include_engine, + include_url=include_url, + ) + + def inner( + callable: Callable[..., list[str]], + ) -> Callable[..., list[str]]: + self.relationship_registry[params] = callable + return callable + + return inner + + +registry = FunctionRegistry() diff --git a/recap/repl.py b/recap/repl.py new file mode 100644 index 00000000..12de37fe --- /dev/null +++ b/recap/repl.py @@ -0,0 +1,167 @@ +""" +Recap's REPL functions are a convenient way to interact with Recap in Python. +The functions behave similarly to Recap's [CLI](cli.md), but return Python +objects. +""" + + +import logging +from datetime import datetime + +from recap.catalog import create_catalog +from recap.crawler import create_crawler +from recap.metadata import Schema +from recap.storage.abstract import MetadataSubtype + +catalog = create_catalog() +log = logging.getLogger(__name__) + + +def crawl(url: str, **kwargs): + """ + Recursively crawl a URL and its children, storing metadata and + relationships in Recap's data catalog. + """ + + create_crawler().crawl(url, **kwargs) + + +def ls( + url: str, + time: datetime | None = None, + refresh: bool = False, + **kwargs, +) -> list[str] | None: + """ + List a URL's child URLs. + + :param url: URL to list children for. + :param time: Time travel to see what a URL's children used to be. + :param refresh: Skip Recap's catalog and read the latest data directly from + the URL. + :param kwargs: Arbitrary options passed to the crawler. + :returns: A list of the input URL's children. + """ + + return catalog.ls(url, time, refresh, **kwargs) + + +def readers( + url: str, + time: datetime | None = None, + refresh: bool = False, + **kwargs, +) -> list[str] | None: + """ + See what reads from a URL. + + :param url: URL to fetch readers for. + :param time: Time travel to see what a URL's readers used to be. + :param refresh: Skip Recap's catalog and read the latest data directly from + the URL. + :param kwargs: Arbitrary options passed to the catalog. + :returns: A list URLs that read the input URL. 
+    """
+
+    return catalog.readers(url, time, refresh, **kwargs)
+
+
+def reads(
+    url: str,
+    time: datetime | None = None,
+    refresh: bool = False,
+    **kwargs,
+) -> list[str] | None:
+    """
+    See what a URL reads. URLs must be accounts or jobs.
+
+    :param url: URL to fetch reads for.
+    :param time: Time travel to see what a URL used to read.
+    :param refresh: Skip Recap's catalog and read the latest data directly from
+        the URL.
+    :param kwargs: Arbitrary options passed to the catalog.
+    :returns: A list of URLs that the input URL reads.
+    """
+
+    return catalog.reads(url, time, refresh, **kwargs)
+
+
+def schema(
+    url: str,
+    time: datetime | None = None,
+    refresh: bool = False,
+    **kwargs,
+) -> Schema | None:
+    """
+    Get a Recap schema for a URL.
+
+    :param url: URL to fetch a schema for.
+    :param time: Time travel to see what a URL's schema used to be.
+    :param refresh: Skip Recap's catalog and read the latest data directly from
+        the URL.
+    :param kwargs: Arbitrary options passed to the catalog.
+    :returns: A Recap schema for the URL, or `None` if no schema is found.
+    """
+
+    return catalog.schema(url, time, refresh, **kwargs)
+
+
+def search(
+    query: str,
+    metadata_type: type[MetadataSubtype],
+    time: datetime | None = None,
+) -> list[MetadataSubtype]:
+    """
+    Search Recap's storage layer for metadata.
+
+    :param query: Query to apply to the storage layer. This is usually a
+        `WHERE` clause, for example:
+            `"url='file:///tmp/recap-test/foo/foo.json'"`
+        or
+            `"json_extract(metadata_obj, '$.fields.name') = 'email'"`
+    :param metadata_type: The type of metadata to search for.
+    :param time: Time travel to see what a URL's metadata used to be.
+    :returns: A list of metadata documents that match the search.
+    """
+
+    return catalog.search(query, metadata_type, time)
+
+
+def writers(
+    url: str,
+    time: datetime | None = None,
+    refresh: bool = False,
+    **kwargs,
+) -> list[str] | None:
+    """
+    See what writes to a URL.
+
+    :param url: URL to fetch writers for.
+    :param time: Time travel to see what a URL's writers used to be.
+    :param refresh: Skip Recap's catalog and read the latest data directly from
+        the URL.
+    :param kwargs: Arbitrary options passed to the catalog.
+    :returns: A list of URLs that write to the input URL.
+    """
+
+    return catalog.writers(url, time, refresh, **kwargs)
+
+
+def writes(
+    url: str,
+    time: datetime | None = None,
+    refresh: bool = False,
+    **kwargs,
+) -> list[str] | None:
+    """
+    See what a URL writes. URLs must be accounts or jobs.
+
+    :param url: URL to fetch writes for.
+    :param time: Time travel to see where a URL used to write.
+    :param refresh: Skip Recap's catalog and read the latest data directly from
+        the URL.
+    :param kwargs: Arbitrary options passed to the catalog.
+    :returns: A list of URLs that the input URL writes to.
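Taken together, a typical REPL session might look like the following sketch. The database URL is hypothetical, and Recap's default storage settings are assumed:

```python
from recap import repl

# Crawl a database, storing schemas and "contains" links in the catalog.
repl.crawl("postgresql://localhost/some_db")

# Browse what the crawl discovered.
print(repl.ls("postgresql://localhost/some_db/public"))

# Fetch the stored schema for a table.
print(repl.schema("postgresql://localhost/some_db/public/users"))
```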
+ """ + + return catalog.writes(url, time, refresh, **kwargs) diff --git a/recap/routers/__init__.py b/recap/routers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/recap/routers/catalog.py b/recap/routers/catalog.py deleted file mode 100644 index ba23be62..00000000 --- a/recap/routers/catalog.py +++ /dev/null @@ -1,77 +0,0 @@ -from datetime import datetime -from typing import Any - -from fastapi import APIRouter, Depends, HTTPException - -from recap.catalogs.abstract import AbstractCatalog -from recap.schemas.schema import Schema -from recap.server import get_catalog - -router = APIRouter( - prefix="/catalog", -) - - -@router.get("/urls/{url:path}") -def get_children( - url: str, - time: datetime | None = None, - catalog: AbstractCatalog = Depends(get_catalog), -) -> list[str]: - children = catalog.children(url, time) - if children is not None: - return children - raise HTTPException(status_code=404) - - -@router.put("/urls/{url:path}") -def put_url( - url: str, - catalog: AbstractCatalog = Depends(get_catalog), -): - catalog.add(url) - - -@router.delete("/urls/{url:path}") -def delete_url( - url: str, - catalog: AbstractCatalog = Depends(get_catalog), -): - return catalog.remove(url) - - -@router.put("/schemas/{url:path}") -def write_schema( - url: str, - schema: Schema, - catalog: AbstractCatalog = Depends(get_catalog), -): - catalog.add(url, schema) - - -@router.get("/schemas/{url:path}") -def get_schema( - url: str, - time: datetime | None = None, - catalog: AbstractCatalog = Depends(get_catalog), -) -> Schema: - if metadata := catalog.read(url, Schema, time=time): - return metadata - raise HTTPException(status_code=404) - - -@router.delete("/schemas/{url:path}") -def delete_schema( - url: str, - catalog: AbstractCatalog = Depends(get_catalog), -): - catalog.remove(url, Schema) - - -@router.get("/schemas") -def search_schema( - query: str, - time: datetime | None = None, - catalog: AbstractCatalog = Depends(get_catalog), -) -> list[Schema]: - return catalog.search(query, Schema, time) diff --git a/recap/schemas/__init__.py b/recap/schemas/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/recap/schemas/schema.py b/recap/schemas/schema.py deleted file mode 100644 index 521c4817..00000000 --- a/recap/schemas/schema.py +++ /dev/null @@ -1,23 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass - -from recap.metadata import Metadata - - -@dataclass -class Schema(Metadata): - fields: list[Field] - - @classmethod - def key(cls) -> str: - return "schemas" - - -@dataclass -class Field: - name: str - type: str | None = None - default: str | None = None - nullable: bool | None = None - comment: str | None = None diff --git a/recap/server.py b/recap/server.py index 63631db8..e373ae7d 100644 --- a/recap/server.py +++ b/recap/server.py @@ -1,27 +1,87 @@ +from datetime import datetime from typing import Generator -from fastapi import FastAPI +from fastapi import APIRouter, Depends, FastAPI, HTTPException -from recap.catalogs.abstract import AbstractCatalog -from recap.config import settings +from recap.metadata import Schema +from recap.storage import create_storage +from recap.storage.abstract import AbstractStorage, Direction -from . 
import catalogs, plugins +storage_router = APIRouter( + prefix="/storage", +) -DEFAULT_URL = "http://localhost:8000" +def get_storage() -> Generator[AbstractStorage, None, None]: + yield create_storage() -fastapp = FastAPI() + +@storage_router.get("/{url:path}/metadata/schema") +def metadata( + url: str, + time: datetime | None = None, + storage: AbstractStorage = Depends(get_storage), +) -> Schema: + if schema := storage.metadata(url, Schema, time): + return schema + raise HTTPException(status_code=404) + + +@storage_router.get("/{url:path}/links/{relationship}") +def links( + url: str, + relationship: str, + time: datetime | None = None, + direction_type: str | None = None, + storage: AbstractStorage = Depends(get_storage), +) -> list[str]: + direction = ( + Direction.FROM + if not direction_type or direction_type.lower() == "from" + else Direction.TO + ) + if links := storage.links(url, relationship, time, direction): + return links + raise HTTPException(status_code=404) + + +@storage_router.get("/search/schema") +def search( + query: str, + time: datetime | None = None, + storage: AbstractStorage = Depends(get_storage), +) -> list[Schema]: + return storage.search(query, Schema, time) -def get_catalog() -> Generator[AbstractCatalog, None, None]: - with catalogs.create_catalog(**settings("catalog", {})) as c: - yield c +@storage_router.put("/{url:path}/metadata/schema") +def write( + url: str, + schema: Schema, + storage: AbstractStorage = Depends(get_storage), +): + storage.write(url, schema) -@fastapp.on_event("startup") -def load_plugins(): - allowed_plugins = settings("server.plugins", []) - router_plugins = plugins.load_router_plugins() - for plugin_name, plugin_router in router_plugins.items(): - if not allowed_plugins or plugin_name in allowed_plugins: - fastapp.include_router(plugin_router) +@storage_router.post("/{url:path}/links/{relationship}") +def link( + url: str, + relationship: str, + other_url: str, + storage: AbstractStorage = Depends(get_storage), +): + storage.link(url, relationship, other_url) + + +@storage_router.delete("/{url:path}/links/{relationship}") +def unlink( + url: str, + relationship: str, + other_url: str, + storage: AbstractStorage = Depends(get_storage), +): + storage.unlink(url, relationship, other_url) + + +fastapp = FastAPI() +fastapp.include_router(storage_router) diff --git a/recap/storage/__init__.py b/recap/storage/__init__.py new file mode 100644 index 00000000..38874907 --- /dev/null +++ b/recap/storage/__init__.py @@ -0,0 +1,16 @@ +from recap.storage.abstract import AbstractStorage + + +def create_storage(url: str | None = None, **storage_opts) -> AbstractStorage: + from recap.config import settings + from recap.storage.db import DatabaseStorage + from recap.storage.remote import RemoteStorage + + storage_settings = settings.storage_settings + combined_opts = storage_settings.opts | storage_opts + url = url or storage_settings.url + + if url.startswith("http"): + return RemoteStorage(url, **combined_opts) + + return DatabaseStorage(url, **combined_opts) diff --git a/recap/storage/abstract.py b/recap/storage/abstract.py new file mode 100644 index 00000000..4f979edf --- /dev/null +++ b/recap/storage/abstract.py @@ -0,0 +1,136 @@ +from abc import ABC, abstractmethod +from datetime import datetime +from enum import Enum +from typing import TypeVar + +from pydantic import BaseModel + +MetadataSubtype = TypeVar("MetadataSubtype", bound=BaseModel) +""" +A templated type that extends Pydantic's `BaseModel` class. 
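The storage router above can be exercised over HTTP once the server is running (for example with `uvicorn recap.server:fastapp`). A hedged sketch using `httpx`; the table URL is hypothetical, and URLs are quoted into the request path:

```python
# Illustrative only; assumes the FastAPI app is running on localhost:8000.
from urllib.parse import quote_plus

import httpx

base = "http://localhost:8000/storage"
table = quote_plus("postgresql://localhost/some_db/public/users")

# Fetch the stored schema for a URL (404 until something has been written).
response = httpx.get(f"{base}/{table}/metadata/schema")
print(response.status_code, response.json() if response.is_success else None)

# Fetch the parent's "contains" links.
parent = quote_plus("postgresql://localhost/some_db/public")
print(httpx.get(f"{base}/{parent}/links/contains").json())
```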
+""" + + +class Direction(Enum): + """ + Edge direction between two nodes. + """ + + FROM = 1 + TO = 2 + + +class AbstractStorage(ABC): + """ + An abstract representation of Recap's storage layer. + + Recap's storage layer provides graph, search, and time travel capabilities. + Nodes are represented by URLs. A URL can be linked to another with a + relationship type. Metadata can be attached to a node. + """ + + @abstractmethod + def metadata( + self, + url: str, + metadata_type: type[MetadataSubtype], + time: datetime | None = None, + ) -> MetadataSubtype | None: + """ + Read a type of metadata for a URL. + + :param url: + :param metadata_type: + :param time: + :returns: + """ + + raise NotImplementedError + + @abstractmethod + def links( + self, + url: str, + relationship: str, + time: datetime | None = None, + direction: Direction = Direction.FROM, + ) -> list[str]: + """ + Read graph edges (links) for a URL. + + :param url: + :param relationship: + :param time: + :param direction: + :returns: + """ + + raise NotImplementedError + + @abstractmethod + def search( + self, + query: str, + metadata_type: type[MetadataSubtype], + time: datetime | None = None, + ) -> list[MetadataSubtype]: + """ + Search for metadata. + + :param query: + :param metadata_type: + :param time: + :returns: + """ + + raise NotImplementedError + + @abstractmethod + def write( + self, + url: str, + metadata: BaseModel, + ): + """ + (Over)write a type of metadata for a URL. + + :param url: + :param metadata: + """ + + raise NotImplementedError + + @abstractmethod + def link( + self, + url: str, + relationship: str, + other_url: str, + ): + """ + Connect two URLs in the graph with relationship type. + + :param url: + :param relationship: + :param other_url: + """ + + raise NotImplementedError + + @abstractmethod + def unlink( + self, + url: str, + relationship: str, + other_url: str, + ): + """ + Disconnect two URLs in the graph with relationship type. 
+ + :param url: + :param relationship: + :param other_url: + :returns: + """ + + raise NotImplementedError diff --git a/recap/storage/db.py b/recap/storage/db.py new file mode 100644 index 00000000..2f55a7f3 --- /dev/null +++ b/recap/storage/db.py @@ -0,0 +1,248 @@ +from datetime import datetime + +from pydantic import BaseModel +from sqlalchemy import Column, DateTime, Index, create_engine, select +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker +from sqlalchemy.schema import Sequence +from sqlalchemy.sql import func, text +from sqlalchemy.types import JSON, BigInteger, Integer, String + +from recap.storage.abstract import AbstractStorage, Direction, MetadataSubtype + +Base = declarative_base() + + +class MetadataEntry(Base): + __tablename__ = "metadata" + + # Sequence instead of autoincrement="auto" for DuckDB compatibility + metadata_id_seq = Sequence("metadata_id_seq") + id = Column( + # Use Integer with SQLite since it's suggested by SQLalchemy + BigInteger().with_variant(Integer, "sqlite"), + metadata_id_seq, + primary_key=True, + ) + url = Column(String(65535), nullable=False) + metadata_type = Column(String(255), nullable=False) + metadata_obj = Column( + JSON().with_variant(JSONB, "postgresql"), + nullable=False, + ) + created_at = Column( + DateTime, + nullable=False, + server_default=func.now(), + index=True, + ) + deleted_at = Column(DateTime) + + __table_args__ = ( + Index( + "url_type_idx", + url, + metadata_type, + ), + ) + + def is_deleted(self) -> bool: + return self.deleted_at is not None + + +class LinkEntry(Base): + __tablename__ = "links" + + # Sequence instead of autoincrement="auto" for DuckDB compatibility + link_id_seq = Sequence("link_id_seq") + id = Column( + # Use Integer with SQLite since it's suggested by SQLalchemy + BigInteger().with_variant(Integer, "sqlite"), + link_id_seq, + primary_key=True, + ) + url_from = Column(String(65535), nullable=False) + url_to = Column(String(65535), nullable=False) + relationship_type = Column(String(255), nullable=False) + created_at = Column( + DateTime, + nullable=False, + server_default=func.now(), + index=True, + ) + deleted_at = Column(DateTime) + + __table_args__ = ( + Index( + "from_to_type_idx", + url_from, + url_to, + relationship_type, + ), + ) + + def is_deleted(self) -> bool: + return self.deleted_at is not None + + +class DatabaseStorage(AbstractStorage): + def __init__(self, url: str, **engine_opts): + self.engine = create_engine(url, **engine_opts) + Base.metadata.create_all(self.engine) + self.Session = sessionmaker(self.engine) + + def metadata( + self, + url: str, + metadata_type: type[MetadataSubtype], + time: datetime | None = None, + ) -> MetadataSubtype | None: + with self.Session() as session, session.begin(): + maybe_entry = session.scalar( + select( + MetadataEntry, + ) + .where( + MetadataEntry.url == url, + MetadataEntry.metadata_type == metadata_type.__name__.lower(), + MetadataEntry.created_at <= (time or func.now()), + ) + .order_by( + MetadataEntry.id.desc(), + ) + ) + if metadata := maybe_entry: + if not metadata.is_deleted(): + return metadata_type.parse_obj(metadata.metadata_obj) + + def links( + self, + url: str, + relationship: str, + time: datetime | None = None, + direction: Direction = Direction.FROM, + ) -> list[str]: + with self.Session() as session, session.begin(): + subquery = ( + session.query( + LinkEntry, + func.rank() + .over( + order_by=LinkEntry.id.desc(), + partition_by=( + 
LinkEntry.url_from, + LinkEntry.url_to, + LinkEntry.relationship_type, + ), + ) + .label("rnk"), + ) + .filter( + LinkEntry.url_from == url + if direction == Direction.FROM + else LinkEntry.url_to == url, + LinkEntry.relationship_type == relationship.lower(), + LinkEntry.created_at <= (time or func.now()), + ) + .subquery() + ) + query = session.query(subquery).filter( + subquery.c.rnk == 1, + subquery.c.deleted_at == None, + ) + rows = session.execute(query).fetchall() + return [row[2] if direction == Direction.FROM else row[1] for row in rows] + + def search( + self, + query: str, + metadata_type: type[MetadataSubtype], + time: datetime | None = None, + ) -> list[MetadataSubtype]: + with self.Session() as session: + subquery = ( + session.query( + MetadataEntry.metadata_obj, + MetadataEntry.metadata_type, + MetadataEntry.deleted_at, + func.rank() + .over( + order_by=MetadataEntry.id.desc(), + partition_by=( + MetadataEntry.url, + MetadataEntry.metadata_type, + ), + ) + .label("rnk"), + ) + .filter( + MetadataEntry.metadata_type == str(metadata_type.__name__).lower(), + MetadataEntry.created_at <= (time or func.now()), + # TODO Yikes. Pretty sure this is a SQL injection vulnerability. + text(query), + ) + .subquery() + ) + + query = session.query(subquery).filter( + subquery.c.rnk == 1, + subquery.c.deleted_at == None, + ) # pyright: ignore [reportGeneralTypeIssues] + + rows = session.execute(query).fetchall() + + return [metadata_type.parse_obj(row[0]) for row in rows] + + def write( + self, + url: str, + metadata: BaseModel, + ): + with self.Session() as session, session.begin(): + session.add( + MetadataEntry( + url=url, + metadata_type=str(type(metadata).__name__).lower(), + metadata_obj=metadata.dict(), + ) + ) + + def link( + self, + url: str, + relationship: str, + other_url: str, + ): + with self.Session() as session, session.begin(): + session.add( + LinkEntry( + url_from=url, + url_to=other_url, + relationship_type=relationship.lower(), + ) + ) + + def unlink( + self, + url: str, + relationship: str, + other_url: str, + ): + with self.Session() as session, session.begin(): + maybe_entry = session.scalar( + select( + LinkEntry, + ) + .where( + LinkEntry.url_from == url, + LinkEntry.url_to == other_url, + LinkEntry.relationship_type == relationship.lower(), + ) + .order_by( + LinkEntry.id.desc(), + ) + ) + if link := maybe_entry: + link.deleted_at = datetime.now() + session.commit() diff --git a/recap/storage/remote.py b/recap/storage/remote.py new file mode 100644 index 00000000..b1d82009 --- /dev/null +++ b/recap/storage/remote.py @@ -0,0 +1,120 @@ +from datetime import datetime +from urllib.parse import quote_plus + +from httpx import Client, codes +from pydantic import BaseModel + +from recap.storage.abstract import AbstractStorage, Direction, MetadataSubtype + + +class RemoteStorage(AbstractStorage): + def __init__(self, base_url: str, **httpx_opts): + self.client = Client(base_url=base_url, **httpx_opts) + + def metadata( + self, + url: str, + metadata_type: type[MetadataSubtype], + time: datetime | None = None, + ) -> MetadataSubtype | None: + params: dict[str, str] = {} + if time: + params["time"] = time.isoformat() + response = self.client.get( + f"/{quote_plus(url)}/metadata/{metadata_type.__name__.lower()}", + params=params, + ) + if response.status_code == codes.OK: + match metadata_type: + case Schema: + return Schema.parse_obj(response.json()) + elif response.status_code == codes.NOT_FOUND: + return None + response.raise_for_status() + + def links( + self, + url: 
str, + relationship: str, + time: datetime | None = None, + direction: Direction = Direction.FROM, + ) -> list[str]: # type: ignore + params: dict[str, str] = { + "direction_type": direction.name.lower(), + } + if time: + params["time"] = time.isoformat() + response = self.client.get( + f"/{quote_plus(url)}/links/{relationship}", params=params + ) + match response.status_code: + case codes.OK: + return response.json() + case codes.NOT_FOUND: + return [] + case _: + response.raise_for_status() + + def search( + self, + query: str, + metadata_type: type[MetadataSubtype], + time: datetime | None = None, + ) -> list[MetadataSubtype]: + params = { + "query": query, + } + if time: + params["time"] = time.isoformat() + response = self.client.get( + f"/search/{metadata_type.__name__.lower()}", params=params + ) + if response.status_code == codes.OK: + match metadata_type: + case Schema: + return [Schema.parse_obj(obj) for obj in response.json()] + elif response.status_code == codes.NOT_FOUND: + return [] + response.raise_for_status() + raise Exception("Unexpected: status not OK, but no exception.") + + def write( + self, + url: str, + metadata: BaseModel, + ): + response = self.client.put( + f"/{quote_plus(url)}/metadata/{type(metadata).__name__.lower()}", + json=metadata.dict(), + ) + response.raise_for_status() + + def link( + self, + url: str, + relationship: str, + other_url: str, + ): + params = { + "other_url": other_url, + } + response = self.client.post( + f"/{quote_plus(url)}/links/{relationship}", + params=params, + ) + response.raise_for_status() + + def unlink( + self, + url: str, + relationship: str, + other_url: str, + ): + params = { + "other_url": other_url, + } + response = self.client.delete( + f"/{quote_plus(url)}/links/{relationship}", + params=params, + ) + response.raise_for_status() diff --git a/recap/url.py b/recap/url.py deleted file mode 100644 index 92da81af..00000000 --- a/recap/url.py +++ /dev/null @@ -1,80 +0,0 @@ -from __future__ import annotations - -from functools import cached_property -from pathlib import PurePosixPath -from urllib.parse import quote, urlparse - - -class URL: - def __init__( - self, - url: str | URL, - subpath: str | None = None, - ): - self.url = URL.insert_subpath(url, subpath) if subpath else str(url) - self.parsed_url = urlparse(str(self.url)) - - @cached_property - def dialect(self) -> str: - if scheme := self.parsed_url.scheme: - return scheme.split("+")[0] - return "file" - - @cached_property - def driver(self) -> str | None: - if scheme := self.parsed_url.scheme: - return scheme.split("+")[-1] - - @cached_property - def path(self) -> str | None: - return self.parsed_url.path or None - - @cached_property - def path_posix(self) -> PurePosixPath | None: - if path := self.path: - return PurePosixPath(path) - - @cached_property - def host_port(self) -> str | None: - if netloc := self.parsed_url.netloc: - return netloc.split("@")[-1] - - @cached_property - def host_port_path(self) -> PurePosixPath | None: - host_port = self.host_port or "" - path = self.parsed_url.path.lstrip("/") - if host_port or path: - return PurePosixPath(host_port, path) - - @cached_property - def dialect_host_port_path(self) -> PurePosixPath: - host_port_path = self.host_port_path or PurePosixPath("/") - return PurePosixPath(self.dialect) / str(host_port_path).lstrip("/") - - @staticmethod - def insert_subpath(url: str | URL, subpath: str) -> str: - scheme, netloc, path, params, query, fragment = urlparse(str(url)) - path = PurePosixPath("/") / path / subpath.lstrip("/") 
- if scheme: - scheme = f"{scheme}://" - if params: - params = f";{params}" - if query: - query = f"?{query}" - if fragment: - fragment = f"#{fragment}" - return f"{scheme}{netloc}{path}{params}{query}{fragment}" - - @cached_property - def safe(self) -> URL: - scheme = self.dialect - netloc = self.host_port or "" - path = self.path or "" - return URL(f"{scheme}://{netloc}{path}") - - @cached_property - def encoded(self) -> str: - return quote(str(self)) - - def __str__(self) -> str: - return self.url diff --git a/tests/catalogs/test_db.py b/tests/catalogs/test_db.py deleted file mode 100644 index 9cdb86b0..00000000 --- a/tests/catalogs/test_db.py +++ /dev/null @@ -1,153 +0,0 @@ -# pylint: disable=missing-function-docstring -# pylint: disable=missing-class-docstring -# pylint: disable=missing-module-docstring - -from dataclasses import dataclass -from datetime import datetime - -import pytest -from sqlalchemy import create_engine - -from recap.catalogs.db import DatabaseCatalog, MetadataEntry, PathEntry -from recap.metadata import Metadata - - -@dataclass -class TestMetadata(Metadata): - field1: str - field2: int | None - - @classmethod - def key(cls): - return "test" - - -class TestMetadataEntry: - def test_is_deleted(self): - entry = MetadataEntry(deleted_at=datetime.now()) - assert entry.is_deleted() is True - - def test_not_deleted(self): - entry = MetadataEntry(deleted_at=None) - assert entry.is_deleted() is False - - -class TestPathEntry: - def test_is_deleted(self): - entry = PathEntry(deleted_at=datetime.now()) - assert entry.is_deleted() is True - - def test_not_deleted(self): - entry = PathEntry(deleted_at=None) - assert entry.is_deleted() is False - - -class TestDatabaseCatalog: - @pytest.fixture - def engine(self): - return create_engine("sqlite:///:memory:") - - @pytest.fixture - def catalog(self, engine): - return DatabaseCatalog(engine) - - @pytest.fixture - def metadata(self): - return TestMetadata(field1="test", field2=123) - - def test_catalog_children_doesnt_exist(self, catalog): - parent = "postgresql://localhost/some_db/some_schema" - child = f"{parent}/some_table" - - catalog.add(child) - assert catalog.children(parent) == ["some_table"] - - def test_catalog_children_does_exist(self, catalog): - parent = "postgresql://localhost/some_db/some_schema" - child = f"{parent}/some_table" - - catalog.add(child) - assert catalog.children(parent) == ["some_table"] - - catalog.add(child) - assert catalog.children(parent) == ["some_table"] - - def test_catalog_add_deleted_path(self, catalog): - parent = "postgresql://localhost/some_db/some_schema" - child = f"{parent}/some_table" - - catalog.add(child) - assert catalog.children(parent) == ["some_table"] - - catalog.remove(child) - assert catalog.children(parent) is None - - catalog.add(child) - assert catalog.children(parent) == ["some_table"] - - def test_write(self, catalog, metadata): - url = "bigquery://some-project-1234/some_dataset/some_table" - - assert catalog.read(url, type(metadata)) is None - catalog.add(url, metadata) - assert catalog.read(url, type(metadata)) == metadata - - def test_write_metadata_after_rm(self, catalog, metadata): - url = "bigquery://some-project-1234/some_dataset/some_table" - - assert catalog.read(url, type(metadata)) is None - - catalog.add(url, metadata) - assert catalog.read(url, type(metadata)) == metadata - - catalog.remove(url, type(metadata)) - assert catalog.read(url, type(metadata)) is None - - def test_rm_path(self, catalog, metadata): - parent = 
"bigquery://some-project-1234/some_dataset" - child = f"{parent}/some_table" - - assert catalog.read(child, type(metadata)) is None - assert catalog.children(parent) is None - - catalog.add(child, metadata) - assert catalog.read(child, type(metadata)) == metadata - assert catalog.children(parent) == ["some_table"] - - catalog.remove(parent) - assert catalog.read(child, type(metadata)) is None - assert catalog.children(parent) is None - - def test_ls_no_entry(self, catalog): - url = "bigquery://some-project-1234/some_dataset" - assert catalog.children(url) is None - - def test_ls_one_entry(self, catalog): - parent = "bigquery://some-project-1234/some_dataset" - child = f"{parent}/some_table" - - catalog.add(child) - assert sorted(catalog.children(parent)) == ["some_table"] - - def test_ls_multiple_entries(self, catalog): - parent = "bigquery://some-project-1234/some_dataset" - child_path_one = f"{parent}/some_table_1" - child_path_two = f"{parent}/some_table_2" - child_path_three = f"{parent}/some_table_3" - catalog.add(child_path_one) - catalog.add(child_path_three) - catalog.add(child_path_two) - assert sorted(catalog.children(parent)) == sorted( - ["some_table_1", "some_table_2", "some_table_3"] - ) - - def test_search(self, catalog, metadata): - path = "bigquery://some-project-1234/some_dataset/some_table" - - catalog.add(path, metadata) - search_result = catalog.search( - "json_extract(metadata_obj, '$.\"field1\"') = 'test'", - type(metadata), - ) - - assert search_result == [metadata] diff --git a/tests/storage/test_db.py b/tests/storage/test_db.py new file mode 100644 index 00000000..6a2115c8 --- /dev/null +++ b/tests/storage/test_db.py @@ -0,0 +1,81 @@ +# pylint: disable=missing-function-docstring +# pylint: disable=missing-class-docstring +# pylint: disable=missing-module-docstring + +import pytest +from pydantic import BaseModel + +from recap.catalog import Relationship +from recap.storage import create_storage +from recap.storage.abstract import AbstractStorage + + +class TestMetadata(BaseModel): + field1: str + field2: int | None + + +class TestDatabaseStorage: + @pytest.fixture + def storage(self) -> AbstractStorage: + return create_storage("sqlite:///:memory:") + + @pytest.fixture + def metadata(self): + return TestMetadata(field1="test", field2=123) + + def test_link_doesnt_exist(self, storage: AbstractStorage): + url = "postgresql://localhost/some_db/some_schema" + assert storage.links(url, Relationship.CONTAINS.name) == [] + + def test_link_does_exist(self, storage: AbstractStorage): + parent = "postgresql://localhost/some_db/some_schema" + child = f"{parent}/some_table" + assert storage.links(parent, Relationship.CONTAINS.name) == [] + + storage.link(parent, Relationship.CONTAINS.name, child) + assert storage.links(parent, Relationship.CONTAINS.name) == [child] + + def test_link_readded(self, storage: AbstractStorage): + parent = "postgresql://localhost/some_db/some_schema" + child = f"{parent}/some_table" + assert storage.links(parent, Relationship.CONTAINS.name) == [] + + storage.link(parent, Relationship.CONTAINS.name, child) + assert storage.links(parent, Relationship.CONTAINS.name) == [child] + + storage.unlink(parent, Relationship.CONTAINS.name, child) + assert storage.links(parent, Relationship.CONTAINS.name) == [] + + storage.link(parent, Relationship.CONTAINS.name, child) + assert storage.links(parent, Relationship.CONTAINS.name) == [child] + + def test_write(self, storage: AbstractStorage, metadata: TestMetadata): + url = 
"bigquery://some-project-1234/some_dataset/some_table" + assert storage.metadata(url, type(metadata)) is None + + storage.write(url, metadata) + assert storage.metadata(url, type(metadata)) == metadata + + def test_ls_multiple_entries(self, storage: AbstractStorage): + parent = "bigquery://some-project-1234/some_dataset" + child_path_one = f"{parent}/some_table_1" + child_path_two = f"{parent}/some_table_2" + child_path_three = f"{parent}/some_table_3" + storage.link(parent, Relationship.CONTAINS.name, child_path_one) + storage.link(parent, Relationship.CONTAINS.name, child_path_two) + storage.link(parent, Relationship.CONTAINS.name, child_path_three) + assert sorted(storage.links(parent, Relationship.CONTAINS.name)) == sorted( + [child_path_one, child_path_two, child_path_three] + ) + + def test_search(self, storage: AbstractStorage, metadata: TestMetadata): + url = "bigquery://some-project-1234/some_dataset/some_table" + + storage.write(url, metadata) + search_result = storage.search( + "json_extract(metadata_obj, '$.\"field1\"') = 'test'", + type(metadata), + ) + + assert search_result == [metadata]