From 070007cc06bfc3772b400fdce0997f81d19dbd4c Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Tue, 1 Oct 2024 21:42:36 +0200 Subject: [PATCH] Document parsing --- .github/FUNDING.yml | 1 - README.md | 97 ++++++++- composer.json | 13 +- src/Connectors/ParseConnector.php | 89 +++++++++ src/DocumentFormat/DocumentNode.php | 115 +++++++++++ src/DocumentFormat/PageNode.php | 102 ++++++++++ src/DocumentProcessor.php | 20 ++ src/Dto/DocumentDto.php | 40 ++++ src/Exceptions/EmptyDocumentException.php | 10 + .../InvalidDocumentFormatException.php | 10 + src/ParseOption.php | 13 ++ src/Requests/ExtractTextRequest.php | 65 ++++++ src/Responses/ParseResponse.php | 11 ++ tests/ArchTest.php | 18 +- tests/AuthTest.php | 34 ++++ tests/ExampleTest.php | 5 - tests/Fixtures/Saloon/extract-text-empty.json | 7 + .../Saloon/extract-text-non-existing.json | 7 + .../Fixtures/Saloon/extract-text-non-pdf.json | 7 + .../extract-text-pdfact-not-available.json | 8 + .../Fixtures/Saloon/extract-text-pymupdf.json | 7 + tests/Fixtures/Saloon/extract-text.json | 7 + tests/ParseProcessorSelectionTest.php | 70 +++++++ tests/ParseTest.php | 187 ++++++++++++++++++ tests/Pest.php | 9 + tests/ValidationTest.php | 34 ++++ 26 files changed, 970 insertions(+), 16 deletions(-) delete mode 100644 .github/FUNDING.yml create mode 100644 src/Connectors/ParseConnector.php create mode 100644 src/DocumentFormat/DocumentNode.php create mode 100644 src/DocumentFormat/PageNode.php create mode 100644 src/DocumentProcessor.php create mode 100644 src/Dto/DocumentDto.php create mode 100644 src/Exceptions/EmptyDocumentException.php create mode 100644 src/Exceptions/InvalidDocumentFormatException.php create mode 100644 src/ParseOption.php create mode 100644 src/Requests/ExtractTextRequest.php create mode 100644 src/Responses/ParseResponse.php create mode 100644 tests/AuthTest.php delete mode 100644 tests/ExampleTest.php create mode 100644 tests/Fixtures/Saloon/extract-text-empty.json create mode 100644 
tests/Fixtures/Saloon/extract-text-non-existing.json create mode 100644 tests/Fixtures/Saloon/extract-text-non-pdf.json create mode 100644 tests/Fixtures/Saloon/extract-text-pdfact-not-available.json create mode 100644 tests/Fixtures/Saloon/extract-text-pymupdf.json create mode 100644 tests/Fixtures/Saloon/extract-text.json create mode 100644 tests/ParseProcessorSelectionTest.php create mode 100644 tests/ParseTest.php create mode 100644 tests/ValidationTest.php diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml deleted file mode 100644 index d2995c5..0000000 --- a/.github/FUNDING.yml +++ /dev/null @@ -1 +0,0 @@ -github: OneOffTech diff --git a/README.md b/README.md index 54444cd..2e0de13 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,9 @@ [![Tests](https://img.shields.io/github/actions/workflow/status/oneofftech/oneofftech-parse-client/run-tests.yml?branch=main&label=tests&style=flat-square)](https://github.com/oneofftech/oneofftech-parse-client/actions/workflows/run-tests.yml) [![Total Downloads](https://img.shields.io/packagist/dt/oneofftech/oneofftech-parse-client.svg?style=flat-square)](https://packagist.org/packages/oneofftech/oneofftech-parse-client) -Parse client is a library to interact with OneOffTech PDF Parsing service based on [PDFAct](https://github.com/data-house/pdfact). OneOffTech Parse is designed to extract text from PDF files maintaining the structure of the document to improve interaction with Large Language Models (LLMs). +Parse client is a library to interact with [OneOffTech Parse](https://parse.oneofftech.de) service. OneOffTech Parse is designed to extract text from PDF files preserving the [structure of the document](#document-structure) to improve interaction with Large Language Models (LLMs). + +OneOffTech Parse is based on [PDF Text extractor](https://github.com/data-house/pdf-text-extractor). The client is suitable to connect to self-hosted versions of the [PDF Text extractor](https://github.com/data-house/pdf-text-extractor). 
> [!INFO] @@ -13,7 +15,7 @@ Parse client is a library to interact with OneOffTech PDF Parsing service based ## Installation -You can install the package via composer: +You can install the package via Composer: ```bash composer require oneofftech/parse-client @@ -21,14 +23,101 @@ composer require oneofftech/parse-client ## Usage +The Parse client is able to connect to self-hosted instances of the [PDF Text extractor](https://github.com/data-house/pdf-text-extractor) service or the cloud hosted [OneOffTech Parse](https://parse.oneofftech.de) service. + +### Use with self-hosted instance + +Before proceeding, a running instance of the [PDF Text extractor](https://github.com/data-house/pdf-text-extractor) is required. Once you have a running instance, create an instance of the connector client, passing the URL on which your instance is listening. + +```php +use OneOffTech\Parse\Client\Connectors\ParseConnector; + +$client = new ParseConnector(baseUrl: "http://localhost:5000"); + +/** @var \OneOffTech\Parse\Client\Dto\DocumentDto */ +$document = $client->parse("https://domain.internal/document.pdf"); +``` + +> [!NOTE] +> - The URL of the document must be accessible without authentication. +> - Documents are downloaded for the time of processing and then the file is immediately deleted. + + +### Use the cloud hosted service + +Go to [parse.oneofftech.de](https://parse.oneofftech.de) and obtain an access token. Instantiate the client and provide a URL of a PDF document. + +```php +use OneOffTech\Parse\Client\Connectors\ParseConnector; + +$client = new ParseConnector("token"); + +/** @var \OneOffTech\Parse\Client\Dto\DocumentDto */ +$document = $client->parse("https://domain.internal/document.pdf"); +``` + +> [!NOTE] +> - The URL of the document must be accessible without authentication. +> - Documents are downloaded for the time of processing and then the file is immediately deleted. 
+ + +### Specify the preferred extraction method + +Parse service supports different processors, [`pymupdf`](https://github.com/pymupdf/PyMuPDF) or [`pdfact`](https://github.com/data-house/pdfact). You can specify the preferred processor for each request. + ```php -... +use OneOffTech\Parse\Client\ParseOption; +use OneOffTech\Parse\Client\DocumentProcessor; +use OneOffTech\Parse\Client\Connectors\ParseConnector; + +$client = new ParseConnector("token"); + +/** @var \OneOffTech\Parse\Client\Dto\DocumentDto */ +$document = $client->parse( + url: "https://domain.internal/document.pdf", + options: new ParseOption(DocumentProcessor::PYMUPDF) +); +``` + +### PDFAct vs PyMuPDF + +PDFAct offers more flexibility than PyMuPDF. You should evaluate the extraction method best suitable for your application. Here is a small comparison of the two methods. + +| feature | PDFAct | PyMuPDF | +|-----------------------------------|--------|---------| +| Text extraction | :white_check_mark: | :white_check_mark: | +| Pagination | :white_check_mark: | :white_check_mark: | +| Headings identification | :white_check_mark: | - | +| Text styles (e.g. bold or italic) | :white_check_mark: | - | +| Page header | :white_check_mark: | - | +| Page footer | :white_check_mark: | - | + + + + +## Document structure + +Parse is designed to preserve the document's structure hence the content is returned in a hierarchical fashion. + +``` +Document + ├─Page + │ ├─Text (category: heading) + │ └─Text (category: body) + └─Page + ├─Text (category: heading) + └─Text (category: body) ``` +For a more in-depth explanation of the structure see [Parse Document Model](https://github.com/OneOffTech/parse-document-model-python). + + ## Testing Parse client is tested using [PEST](https://pestphp.com/). Tests run for each commit and pull request. 
+To execute the test suite run: + ```bash composer test ``` @@ -39,7 +128,7 @@ Please see [CHANGELOG](CHANGELOG.md) for more information on what has changed re ## Contributing -Thank you for considering contributing to the Librarian client! The contribution guide can be found in the [CONTRIBUTING.md](./.github/CONTRIBUTING.md) file. +Thank you for considering contributing to the Parse client! The contribution guide can be found in the [CONTRIBUTING.md](./.github/CONTRIBUTING.md) file. ## Security Vulnerabilities diff --git a/composer.json b/composer.json index bc683e6..f1ba6e4 100644 --- a/composer.json +++ b/composer.json @@ -1,9 +1,11 @@ { "name": "oneofftech/parse-client", - "description": "This is my package oneofftech-parse-client", + "description": "Parse PDF document keeping the structure.", "keywords": [ - "OneOffTech", - "oneofftech-parse-client" + "pdf", + "parse", + "parsing", + "text-extract" ], "homepage": "https://github.com/oneofftech/oneofftech-parse-client", "license": "MIT", @@ -19,8 +21,9 @@ "saloonphp/saloon": "^3.10" }, "require-dev": { - "pestphp/pest": "^2.20", - "laravel/pint": "^1.0" + "jonpurvis/lawman": "^1.2", + "laravel/pint": "^1.0", + "pestphp/pest": "^2.20" }, "autoload": { "psr-4": { diff --git a/src/Connectors/ParseConnector.php b/src/Connectors/ParseConnector.php new file mode 100644 index 0000000..0cdd648 --- /dev/null +++ b/src/Connectors/ParseConnector.php @@ -0,0 +1,89 @@ +baseUrl; + } + + protected function defaultAuth(): Authenticator + { + if (is_null($this->token)) { + return new NullAuthenticator; + } + + return new TokenAuthenticator($this->token); + } + + /** + * Determine if the request has failed. 
+ */ + public function hasRequestFailed(Response $response): ?bool + { + return $response->serverError() || $response->clientError(); + } + + // Resources and helper methods + + /** + * Parse a document hosted on a web server + * + * @param string $url The URL under which the document is accessible + * @param string $mimeType The mime type of the document. Default application/pdf + * @param \OneOffTech\Parse\Client\ParseOption|null $options Specify additional options for the specific parsing processor. When omitted, the PDFAct processor is requested + */ + public function parse(string $url, string $mimeType = 'application/pdf', ?ParseOption $options = null): DocumentDto + { + return $this + ->send((new ExtractTextRequest( + url: $url, + mimeType: $mimeType, + preferredDocumentProcessor: $options?->processor?->value ?? DocumentProcessor::PDFACT->value, + ))->validate()) + ->dto(); + } +} diff --git a/src/DocumentFormat/DocumentNode.php b/src/DocumentFormat/DocumentNode.php new file mode 100644 index 0000000..cf191ea --- /dev/null +++ b/src/DocumentFormat/DocumentNode.php @@ -0,0 +1,115 @@ +content); + } + + /** + * Test if the document is empty, i.e. 
contains no pages or has no textual content on any of the pages + */ + public function isEmpty(): bool + { + return $this->count() === 0 || !$this->hasContent(); + } + + /** + * Test if the document has discernible textual content on any of the pages + */ + public function hasContent(): bool + { + foreach (new RecursiveIteratorIterator(new RecursiveArrayIterator($this->content), RecursiveIteratorIterator::LEAVES_ONLY) as $key => $value) { + if($key === 'text' && (string) $value !== ''){ + return true; + } + } + + return false; + } + + + /** + * The pages in this document + * + * @return \OneOffTech\Parse\Client\DocumentFormat\PageNode[] + */ + public function pages(): array + { + return array_map(fn($page) => PageNode::fromArray($page), $this->content); + } + + public function text(): string + { + $text = []; + + foreach (new RecursiveIteratorIterator(new RecursiveArrayIterator($this->content), RecursiveIteratorIterator::LEAVES_ONLY) as $key => $value) { + if($key === 'text' && (string) $value !== ''){ + $text[] = $value; + } + } + + return join(PHP_EOL, $text); + } + + + /** + * Throw exception if document has no textual content + * + * @throws \OneOffTech\Parse\Client\Exceptions\EmptyDocumentException when document has no textual content + */ + public function throwIfNoContent(): self + { + if(!$this->hasContent()){ + throw new EmptyDocumentException("Document has no textual content."); + } + + return $this; + } + + + /** + * Create a document node from associative array + */ + public static function fromArray(array $data): DocumentNode + { + if(!(isset($data['category']) && isset($data['content']))){ + throw new InvalidDocumentFormatException("Unexpected document structure. Missing category or content."); + } + + if($data['category'] !== 'doc'){ + throw new InvalidDocumentFormatException("Unexpected node category. Expecting [doc] found [{$data['category']}]."); + } + + if(!is_array($data['content'])){ + throw new InvalidDocumentFormatException("Unexpected content format. 
Expecting [array]."); + } + + return new DocumentNode($data['content'] ?? [], $data['attributes'] ?? []); + } +} diff --git a/src/DocumentFormat/PageNode.php b/src/DocumentFormat/PageNode.php new file mode 100644 index 0000000..b3e2deb --- /dev/null +++ b/src/DocumentFormat/PageNode.php @@ -0,0 +1,102 @@ +content); + } + + /** + * Test if the page is empty, i.e. contains no textual content + */ + public function isEmpty(): bool + { + return $this->count() === 0 || !$this->hasContent(); + } + + /** + * Test if the page has discernible textual content + */ + public function hasContent(): bool + { + foreach (new RecursiveIteratorIterator(new RecursiveArrayIterator($this->content), RecursiveIteratorIterator::LEAVES_ONLY) as $key => $value) { + if(($key === 'text' || $key === 'content') && (string) $value !== ''){ + return true; + } + } + + return false; + } + + + /** + * The elements in this page + */ + public function items(): array + { + return $this->content; + } + + public function text(): string + { + $text = []; + + foreach (new RecursiveIteratorIterator(new RecursiveArrayIterator($this->content), RecursiveIteratorIterator::LEAVES_ONLY) as $key => $value) { + if(($key === 'text' || $key === 'content') && (string) $value !== ''){ + $text[] = $value; + } + } + + return join(PHP_EOL, $text); + } + + public function number(): int + { + return (int) ($this->attributes['page'] ?? 1); + } + + + /** + * Create a page node from associative array + */ + public static function fromArray(array $data): PageNode + { + if(!(isset($data['category']) && isset($data['content']))){ + throw new InvalidDocumentFormatException("Unexpected document structure. Missing category or content."); + } + + if($data['category'] !== 'page'){ + throw new InvalidDocumentFormatException("Unexpected node category. Expecting [page] found [{$data['category']}]."); + } + + if(!is_array($data['content'])){ + throw new InvalidDocumentFormatException("Unexpected content format. 
Expecting [array]."); + } + + return new PageNode($data['content'] ?? [], $data['attributes'] ?? []); + } +} diff --git a/src/DocumentProcessor.php b/src/DocumentProcessor.php new file mode 100644 index 0000000..7b33a12 --- /dev/null +++ b/src/DocumentProcessor.php @@ -0,0 +1,20 @@ +raw = DocumentNode::fromArray($data); + } + + public function pages(): array + { + return $this->raw->pages(); + } + + /** + * The number of pages in this document + */ + public function count(): int + { + return $this->raw->count(); + } + + /** + * Get the underlying document node + */ + public function document(): DocumentNode + { + return $this->raw; + } +} diff --git a/src/Exceptions/EmptyDocumentException.php b/src/Exceptions/EmptyDocumentException.php new file mode 100644 index 0000000..ed887c1 --- /dev/null +++ b/src/Exceptions/EmptyDocumentException.php @@ -0,0 +1,10 @@ + $this->url, + 'mime_type' => $this->mimeType, + 'driver' => $this->preferredDocumentProcessor ?? 'pdfact', + ]; + } + + public function validate(): self + { + if (empty(trim($this->url))) { + throw new InvalidArgumentException('The [url] is required to be non-empty.'); + } + + if (empty(trim($this->mimeType))) { + throw new InvalidArgumentException('The [mime type] is required to be non-empty.'); + } + + return $this; + } + + public function createDtoFromResponse(Response $response): DocumentDto + { + $data = $response->json(); + + return (new DocumentDto($data))->setResponse($response); + } +} diff --git a/src/Responses/ParseResponse.php b/src/Responses/ParseResponse.php new file mode 100644 index 0000000..e97d8e3 --- /dev/null +++ b/src/Responses/ParseResponse.php @@ -0,0 +1,11 @@ +expect(['dd', 'dump', 'ray']) + ->expect(['dd', 'dump', 'ray', 'var_dump', 'var_export']) ->not->toBeUsed(); + +test('ParseConnector is a Saloon connector') + ->expect(ParseConnector::class) + ->toBeSaloonConnector() + ->toUseAcceptsJsonTrait() + ->toUseTokenAuthentication() + ->toUseAlwaysThrowOnErrorsTrait(); + 
+test('ExtractTextRequest is a Saloon request') + ->expect(ExtractTextRequest::class) + ->toBeSaloonRequest() + ->toSendPostRequest() + ->toHaveJsonBody(); diff --git a/tests/AuthTest.php b/tests/AuthTest.php new file mode 100644 index 0000000..bffb9ce --- /dev/null +++ b/tests/AuthTest.php @@ -0,0 +1,34 @@ + MockResponse::fixture('extract-text'), + ]); + + $connector = new ParseConnector; + + $connector->withMockClient($mockClient); + + expect($connector->getAuthenticator()) + ->toBeInstanceOf(NullAuthenticator::class); +}); + +test('ensure connector uses token authentication', function () { + $mockClient = MockClient::global([ + ExtractTextRequest::class => MockResponse::fixture('extract-text'), + ]); + + $connector = new ParseConnector('token'); + + $connector->withMockClient($mockClient); + + expect($connector->getAuthenticator()) + ->toBeInstanceOf(TokenAuthenticator::class); +}); diff --git a/tests/ExampleTest.php b/tests/ExampleTest.php deleted file mode 100644 index 5d36321..0000000 --- a/tests/ExampleTest.php +++ /dev/null @@ -1,5 +0,0 @@ -toBeTrue(); -}); diff --git a/tests/Fixtures/Saloon/extract-text-empty.json b/tests/Fixtures/Saloon/extract-text-empty.json new file mode 100644 index 0000000..965d589 --- /dev/null +++ b/tests/Fixtures/Saloon/extract-text-empty.json @@ -0,0 +1,7 @@ +{ + "statusCode": 200, + "headers": { + "content-type": "application\/json" + }, + "data": "{\"category\":\"doc\", \"attributes\": null, \"content\":[{\"category\":\"page\",\"attributes\":{\"page\":1},\"content\":[]}]}" +} \ No newline at end of file diff --git a/tests/Fixtures/Saloon/extract-text-non-existing.json b/tests/Fixtures/Saloon/extract-text-non-existing.json new file mode 100644 index 0000000..de00beb --- /dev/null +++ b/tests/Fixtures/Saloon/extract-text-non-existing.json @@ -0,0 +1,7 @@ +{ + "statusCode": 422, + "headers": { + "content-type": "application\/json" + }, + "data": "{\"detail\":\"File not found at given url.\"}" +} \ No newline at end of file diff 
--git a/tests/Fixtures/Saloon/extract-text-non-pdf.json b/tests/Fixtures/Saloon/extract-text-non-pdf.json new file mode 100644 index 0000000..730535a --- /dev/null +++ b/tests/Fixtures/Saloon/extract-text-non-pdf.json @@ -0,0 +1,7 @@ +{ + "statusCode": 422, + "headers": { + "content-type": "application\/json" + }, + "data": "{\"detail\":\"The given file is not supported. Expected [application/pdf] found [application/vnd.openxmlformats-officedocument.wordprocessingml.document].\"}" +} \ No newline at end of file diff --git a/tests/Fixtures/Saloon/extract-text-pdfact-not-available.json b/tests/Fixtures/Saloon/extract-text-pdfact-not-available.json new file mode 100644 index 0000000..c004574 --- /dev/null +++ b/tests/Fixtures/Saloon/extract-text-pdfact-not-available.json @@ -0,0 +1,8 @@ +{ + "statusCode": 503, + "headers": { + "content-length": "48", + "content-type": "application\/json" + }, + "data": "{\"detail\":\"The pdfact service is not reachable.\"}" +} \ No newline at end of file diff --git a/tests/Fixtures/Saloon/extract-text-pymupdf.json b/tests/Fixtures/Saloon/extract-text-pymupdf.json new file mode 100644 index 0000000..ab33446 --- /dev/null +++ b/tests/Fixtures/Saloon/extract-text-pymupdf.json @@ -0,0 +1,7 @@ +{ + "statusCode": 200, + "headers": { + "content-type": "application\/json" + }, + "data": "{\"category\":\"doc\", \"attributes\": null,\"content\":[{\"category\":\"page\",\"attributes\":{\"page\":1},\"content\":[{\"role\":\"body\",\"text\":\"Type of document \/ Offer \/ Contract \/ Report This is the title of the document, it can use multiple lines and grow a bit Subtitle of the document OneOff-Tech\",\"marks\":[],\"attributes\":{\"bounding_box\":[]}}]},{\"category\":\"page\",\"attributes\":{\"page\":2},\"content\":[{\"role\":\"body\",\"text\":\"Section Heading 1 First chapter This is an example text.\",\"marks\":[],\"attributes\":{\"bounding_box\":[]}}]}]}" +} \ No newline at end of file diff --git a/tests/Fixtures/Saloon/extract-text.json 
b/tests/Fixtures/Saloon/extract-text.json new file mode 100644 index 0000000..68a53f0 --- /dev/null +++ b/tests/Fixtures/Saloon/extract-text.json @@ -0,0 +1,7 @@ +{ + "statusCode": 200, + "headers": { + "content-type": "application\/json" + }, + "data": "{\"category\": \"doc\", \"attributes\": null, \"content\": [{\"category\": \"page\",\"attributes\": {\"page\": 1},\"content\": [{\"role\": \"page-header\",\"text\": \"Type of document / Offer / Contract / Report\",\"marks\": [{\"category\": \"textStyle\",\"color\": {\"r\": 78,\"b\": 189,\"g\": 128,\"id\": \"color-1\"},\"font\": {\"name\": \"fira sans\",\"id\": \"font-300\",\"size\": 18}}],\"attributes\": {\"bounding_box\": [{\"min_x\": 62.1,\"min_y\": 565.0,\"max_x\": 427.2,\"max_y\": 577.6,\"page\": 1}]}},{\"role\": \"title\",\"text\": \"This is the title of the document, it\",\"marks\": [{\"category\": \"textStyle\",\"color\": {\"r\": 0,\"b\": 0,\"g\": 0,\"id\": \"color-0\"},\"font\": {\"name\": \"fira sans\",\"id\": \"font-300\",\"size\": 30}}],\"attributes\": {\"bounding_box\": [{\"min_x\": 62.1,\"min_y\": 532.1,\"max_x\": 514.6,\"max_y\": 554.7,\"page\": 1}]}},{\"role\": \"heading\",\"text\": \"can use multiple lines and grow a bit\",\"marks\": [{\"category\": \"textStyle\",\"color\": {\"r\": 0,\"b\": 0,\"g\": 0,\"id\": \"color-0\"},\"font\": {\"name\": \"fira sans\",\"id\": \"font-300\",\"size\": 30}}],\"attributes\": {\"bounding_box\": [{\"min_x\": 62.1,\"min_y\": 496.1,\"max_x\": 503.0,\"max_y\": 518.7,\"page\": 1},{\"min_x\": 62.1,\"min_y\": 460.1,\"max_x\": 98.7,\"max_y\": 482.7,\"page\": 1}]}},{\"role\": \"heading\",\"text\": \"Subtitle of the document\",\"marks\": [{\"category\": \"textStyle\",\"color\": {\"r\": 247,\"b\": 70,\"g\": 150,\"id\": \"color-2\"},\"font\": {\"name\": \"fira sans\",\"id\": \"font-300\",\"size\": 22}}],\"attributes\": {\"bounding_box\": [{\"min_x\": 62.1,\"min_y\": 431.6,\"max_x\": 296.5,\"max_y\": 447.1,\"page\": 1}]}},{\"role\": \"body\",\"text\": \"OneOff-Tech 
UG\",\"marks\": [{\"category\": \"textStyle\",\"color\": {\"r\": 0,\"b\": 0,\"g\": 0,\"id\": \"color-0\"},\"font\": {\"name\": \"fira sans-bold\",\"id\": \"font-301\",\"size\": 11}},{\"category\": \"bold\"}],\"attributes\": {\"bounding_box\": [{\"min_x\": 62.1,\"min_y\": 209.0,\"max_x\": 253.5,\"max_y\": 217.5,\"page\": 1}]}}]},{\"category\": \"page\",\"attributes\": {\"page\": 2},\"content\": [{\"role\": \"heading\",\"text\": \"1 First chapter\",\"marks\": [{\"category\": \"textStyle\",\"color\": {\"r\": 53,\"b\": 145,\"g\": 96,\"id\": \"color-4\"},\"font\": {\"name\": \"fira sans\",\"id\": \"font-300\",\"size\": 20}}],\"attributes\": {\"bounding_box\": [{\"min_x\": 56.7,\"min_y\": 702.8,\"max_x\": 193.9,\"max_y\": 717.8,\"page\": 2}]}},{\"role\": \"body\",\"text\": \"This is an example text.\",\"marks\": [{\"category\": \"textStyle\",\"color\": {\"r\": 0,\"b\": 0,\"g\": 0,\"id\": \"color-0\"},\"font\": {\"name\": \"fira sans\",\"id\": \"font-300\",\"size\": 11}}],\"attributes\": {\"bounding_box\": [{\"min_x\": 56.7,\"min_y\": 665.0,\"max_x\": 504.2,\"max_y\": 687.3,\"page\": 2}]}}]}]}" +} \ No newline at end of file diff --git a/tests/ParseProcessorSelectionTest.php b/tests/ParseProcessorSelectionTest.php new file mode 100644 index 0000000..28777e3 --- /dev/null +++ b/tests/ParseProcessorSelectionTest.php @@ -0,0 +1,70 @@ + MockResponse::fixture('extract-text-empty'), + ]); + + $connector = new ParseConnector('fake', 'http://localhost:5002'); + $connector->withMockClient($mockClient); + + $connector->parse('http://localhost/empty.pdf'); + + $mockClient->assertSent(ExtractTextRequest::class); + + $mockClient->assertSent(function (Request $request, Response $response) { + if (! 
$request instanceof ExtractTextRequest) { + return false; + } + + /** @var array */ + $body = $request->body()->all(); + + return $body['url'] === 'http://localhost/empty.pdf' + && $body['mime_type'] === 'application/pdf' + && $body['driver'] === 'pdfact'; + }); + + $mockClient->assertSentCount(1); + +}); + +test('ensure pymupdf is selected as processor', function () { + $mockClient = MockClient::global([ + ExtractTextRequest::class => MockResponse::fixture('extract-text-empty'), + ]); + + $connector = new ParseConnector('fake', 'http://localhost:5002'); + $connector->withMockClient($mockClient); + + $connector->parse( + url: 'http://localhost/empty.pdf', + options: new ParseOption(DocumentProcessor::PYMUPDF), + ); + + $mockClient->assertSent(ExtractTextRequest::class); + + $mockClient->assertSent(function (Request $request, Response $response) { + if (! $request instanceof ExtractTextRequest) { + return false; + } + + /** @var array */ + $body = $request->body()->all(); + + return $body['url'] === 'http://localhost/empty.pdf' + && $body['mime_type'] === 'application/pdf' + && $body['driver'] === 'pymupdf'; + }); + + $mockClient->assertSentCount(1); +}); diff --git a/tests/ParseTest.php b/tests/ParseTest.php new file mode 100644 index 0000000..f03beef --- /dev/null +++ b/tests/ParseTest.php @@ -0,0 +1,187 @@ + MockResponse::fixture('extract-text'), + ]); + + $connector = new ParseConnector('fake', 'http://localhost:5002'); + $connector->withMockClient($mockClient); + + $document = $connector->parse('http://localhost/base.pdf', 'application/pdf'); + + expect($document) + ->toBeInstanceOf(DocumentDto::class) + ->toHaveCount(2); + + expect($document->document()->isEmpty()) + ->toBeFalse(); + + expect($document->document()->hasContent()) + ->toBeTrue(); + + expect($document->document()->text()) + ->toBeString()->toContain('This is the title of the document'); + + $pages = $document->pages(); + + expect($pages) + ->toHaveCount(2); + + expect($pages[0]->hasContent()) + 
->toBeTrue(); + + expect($pages[0]->text()) + ->toBeString() + ->toEqual('Type of document / Offer / Contract / Report'.PHP_EOL.'This is the title of the document, it'.PHP_EOL.'can use multiple lines and grow a bit'.PHP_EOL.'Subtitle of the document'.PHP_EOL.'OneOff-Tech UG'); + + expect($pages[0]->number()) + ->toEqual(1); + + expect($pages[1]->number()) + ->toEqual(2); + +}); + +test('can parse a pdf using pymupdf', function () { + $mockClient = MockClient::global([ + ExtractTextRequest::class => MockResponse::fixture('extract-text-pymupdf'), + ]); + + $connector = new ParseConnector('fake', 'http://localhost:5002'); + $connector->withMockClient($mockClient); + + $document = $connector->parse('http://localhost/base.pdf', 'application/pdf', new ParseOption(DocumentProcessor::PYMUPDF)); + + expect($document) + ->toBeInstanceOf(DocumentDto::class) + ->toHaveCount(2); + + expect($document->document()->isEmpty()) + ->toBeFalse(); + + expect($document->document()->hasContent()) + ->toBeTrue(); + + expect($document->pages()) + ->toHaveCount(2); + + expect($document->document()->text()) + ->toBeString()->toContain('This is the title of the document'); + +}); + +test('can parse an empty pdf', function () { + $mockClient = MockClient::global([ + ExtractTextRequest::class => MockResponse::fixture('extract-text-empty'), + ]); + + $connector = new ParseConnector('fake', 'http://localhost:5002'); + $connector->withMockClient(MockClient::getGlobal()); + + $document = $connector->parse('http://localhost/empty.pdf', 'application/pdf'); + + expect($document) + ->toBeInstanceOf(DocumentDto::class) + ->toHaveCount(1); + + expect($document->document()->isEmpty()) + ->toBeTrue(); + + expect($document->document()->hasContent()) + ->toBeFalse(); + + $mockClient->assertSent(ExtractTextRequest::class); + + $mockClient->assertSent(function (Request $request, Response $response) { + if (! 
$request instanceof ExtractTextRequest) { + return false; + } + + /** @var array */ + $body = $request->body()->all(); + + return $body['url'] === 'http://localhost/empty.pdf' && $body['mime_type'] === 'application/pdf'; + }); + + $mockClient->assertSentCount(1); + +}); + +test('cannot parse file types other than pdf', function () { + $mockClient = MockClient::global([ + ExtractTextRequest::class => MockResponse::fixture('extract-text-non-pdf'), + ]); + + $connector = new ParseConnector('fake', 'http://localhost:5002'); + $connector->withMockClient(MockClient::getGlobal()); + + $connector->parse('http://localhost/base.docx'); + + $mockClient->assertSent(ExtractTextRequest::class); + + $mockClient->assertSent(function (Request $request, Response $response) { + if (! $request instanceof ExtractTextRequest) { + return false; + } + + /** @var array */ + $body = $request->body()->all(); + + return $body['url'] === 'http://localhost/base.docx' && $body['mime_type'] === 'application/pdf'; + }); + + $mockClient->assertSentCount(1); + +})->throws(UnprocessableEntityException::class, 'The given file is not supported. Expected [application/pdf] found [application/vnd.openxmlformats-officedocument.wordprocessingml.document].'); + +test('handle non existing files', function () { + $mockClient = MockClient::global([ + ExtractTextRequest::class => MockResponse::fixture('extract-text-non-existing'), + ]); + + $connector = new ParseConnector('fake', 'http://localhost:5002'); + $connector->withMockClient(MockClient::getGlobal()); + + $connector->parse('http://localhost/test.pdf', 'application/pdf'); + + $mockClient->assertSent(ExtractTextRequest::class); + + $mockClient->assertSent(function (Request $request, Response $response) { + if (! 
$request instanceof ExtractTextRequest) { + return false; + } + + /** @var array */ + $body = $request->body()->all(); + + return $body['url'] === 'http://localhost/test.pdf' && $body['mime_type'] === 'application/pdf'; + }); + + $mockClient->assertSentCount(1); + +})->throws(UnprocessableEntityException::class, 'File not found at given url.'); + +test('handle pdfact not available', function () { + $mockClient = MockClient::global([ + ExtractTextRequest::class => MockResponse::fixture('extract-text-pdfact-not-available'), + ]); + + $connector = new ParseConnector('fake', 'http://localhost:5002'); + $connector->withMockClient($mockClient); + + $connector->parse('http://localhost/km-f.pdf', 'application/pdf'); + +})->throws(ServiceUnavailableException::class, 'The pdfact service is not reachable.'); diff --git a/tests/Pest.php b/tests/Pest.php index b3d9bbc..3756d1e 100644 --- a/tests/Pest.php +++ b/tests/Pest.php @@ -1 +1,10 @@ beforeEach(fn () => MockClient::destroyGlobal()) + ->in(__DIR__); diff --git a/tests/ValidationTest.php b/tests/ValidationTest.php new file mode 100644 index 0000000..c526d7d --- /dev/null +++ b/tests/ValidationTest.php @@ -0,0 +1,34 @@ + MockResponse::fixture('extract-text-invalid-param'), + ]); + + $connector = new ParseConnector('fake', 'http://localhost:5002'); + $connector->withMockClient(MockClient::getGlobal()); + + $connector->parse('', 'application/pdf'); + + $mockClient->assertNothingSent(); + +})->throws(InvalidArgumentException::class, 'The [url] is required to be non-empty.'); + +test('mime type required to be non-null', function () { + $mockClient = MockClient::global([ + ExtractTextRequest::class => MockResponse::fixture('extract-text-invalid-param'), + ]); + + $connector = new ParseConnector('fake', 'http://localhost:5002'); + $connector->withMockClient(MockClient::getGlobal()); + + $connector->parse('http://localhost/test.pdf', ''); + + $mockClient->assertNothingSent(); + +})->throws(InvalidArgumentException::class, 'The 
[mime type] is required to be non-empty.');