Skip to content

Commit

Permalink
Added changes for exposing whisper hash (#53)
Browse files Browse the repository at this point in the history
* Added changes for exposing whisper hash

* Added changes for exposing whisper hash

* Sending additional params to the LLM Whisperer side to enable highlighting

* Updated the return type for all the x2text adapters

* Updated constants

* Updated to use the dto class

* Updated to use the dto class

* Corrected the dto class

---------

Co-authored-by: Gayathri <142381512+gaya3-zipstack@users.noreply.github.com>
  • Loading branch information
johnyrahul and gaya3-zipstack authored Jun 27, 2024
1 parent c3b1e70 commit cee091c
Show file tree
Hide file tree
Showing 8 changed files with 86 additions and 31 deletions.
3 changes: 3 additions & 0 deletions src/unstract/adapters/x2text/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ class X2TextConstants:
PLATFORM_SERVICE_API_KEY = "PLATFORM_SERVICE_API_KEY"
X2TEXT_HOST = "X2TEXT_HOST"
X2TEXT_PORT = "X2TEXT_PORT"
ENABLE_HIGHLIGHT = "enable_highlight"
EXTRACTED_TEXT = "extracted_text"
WHISPER_HASH = "whisper-hash"
13 changes: 13 additions & 0 deletions src/unstract/adapters/x2text/dto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from dataclasses import dataclass
from typing import Optional


@dataclass
class TextExtractionMetadata:
    """Metadata that accompanies the text extracted by an X2Text adapter."""

    # Hash identifying the whisper request. The LLM Whisperer adapter in this
    # change fills it from the `whisper-hash` response header and falls back
    # to an empty string when the header is absent.
    whisper_hash: str


@dataclass
class TextExtractionResult:
    """Value returned by an X2Text adapter's ``process`` method."""

    # Text extracted from the input document.
    extracted_text: str
    # Extra information about the extraction. Stays None for adapters that
    # construct the result with only `extracted_text`.
    extraction_metadata: Optional[TextExtractionMetadata] = None
6 changes: 4 additions & 2 deletions src/unstract/adapters/x2text/llama_parse/src/llama_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from unstract.adapters.exceptions import AdapterError
from unstract.adapters.utils import AdapterUtils
from unstract.adapters.x2text.dto import TextExtractionResult
from unstract.adapters.x2text.llama_parse.src.constants import LlamaParseConfig
from unstract.adapters.x2text.x2text_adapter import X2TextAdapter

Expand Down Expand Up @@ -91,13 +92,14 @@ def process(
input_file_path: str,
output_file_path: Optional[str] = None,
**kwargs: dict[Any, Any],
) -> str:
) -> TextExtractionResult:

response_text = self._call_parser(input_file_path=input_file_path)
if output_file_path:
with open(output_file_path, "w", encoding="utf-8") as f:
f.write(response_text)
return response_text

return TextExtractionResult(extracted_text=response_text)

def test_connection(self) -> bool:
self._call_parser(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class WhispererConfig:
FORCE_TEXT_PROCESSING = "force_text_processing"
LINE_SPLITTER_TOLERANCE = "line_splitter_tolerance"
HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor"
STORE_METADATA_FOR_HIGHLIGHTING = "store_metadata_for_highlighting"


class WhisperStatus:
Expand Down
67 changes: 48 additions & 19 deletions src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

from unstract.adapters.exceptions import ExtractorError
from unstract.adapters.utils import AdapterUtils
from unstract.adapters.x2text.constants import X2TextConstants
from unstract.adapters.x2text.dto import TextExtractionMetadata, TextExtractionResult
from unstract.adapters.x2text.llm_whisperer.src.constants import (
HTTPMethod,
OutputModes,
Expand Down Expand Up @@ -126,7 +128,7 @@ def _make_request(
raise ExtractorError(msg)
return response

def _get_whisper_params(self) -> dict[str, Any]:
def _get_whisper_params(self, enable_highlight: bool = False) -> dict[str, Any]:
"""Gets query params meant for /whisper endpoint.
The params is filled based on the configuration passed.
Expand Down Expand Up @@ -167,6 +169,11 @@ def _get_whisper_params(self) -> dict[str, Any]:
),
}
)

if enable_highlight:
params.update(
{WhispererConfig.STORE_METADATA_FOR_HIGHLIGHTING: enable_highlight}
)
return params

def test_connection(self) -> bool:
Expand Down Expand Up @@ -267,26 +274,12 @@ def _extract_async(self, whisper_hash: str) -> str:
f"{retrieve_response.status_code} - {retrieve_response.text}"
)

def process(
self,
input_file_path: str,
output_file_path: Optional[str] = None,
**kwargs: dict[Any, Any],
) -> str:
"""Used to extract text from documents.
Args:
input_file_path (str): Path to file that needs to be extracted
output_file_path (Optional[str], optional): File path to write
extracted text into, if None doesn't write to a file.
Defaults to None.
Returns:
str: Extracted text
"""
def _send_whisper_request(
self, input_file_path: str, enable_highlight: bool = False
) -> requests.Response:
headers = self._get_request_headers()
headers["Content-Type"] = "application/octet-stream"
params = self._get_whisper_params()
params = self._get_whisper_params(enable_highlight)

response: requests.Response
try:
Expand All @@ -301,6 +294,11 @@ def process(
except OSError as e:
logger.error(f"OS error while reading {input_file_path}: {e}")
raise ExtractorError(str(e))
return response

def _extract_text_from_response(
self, output_file_path: Optional[str], response: requests.Response
) -> str:

output = ""
if response.status_code == 200:
Expand All @@ -320,3 +318,34 @@ def process(
logger.error(f"OS error while writing {output_file_path}: {e} ")
raise ExtractorError(str(e))
return output

def process(
    self,
    input_file_path: str,
    output_file_path: Optional[str] = None,
    **kwargs: dict[Any, Any],
) -> TextExtractionResult:
    """Extract text from a document and return it with whisper metadata.

    Sends the file through `_send_whisper_request` and then passes the
    response to `_extract_text_from_response` to obtain the text.

    Args:
        input_file_path (str): Path to file that needs to be extracted
        output_file_path (Optional[str], optional): File path to write
            extracted text into, if None doesn't write to a file.
            Defaults to None.
        **kwargs: Only `X2TextConstants.ENABLE_HIGHLIGHT` is read here;
            it is coerced with `bool()` and forwarded to the whisper
            request. Defaults to False when not supplied.

    Returns:
        TextExtractionResult: The extracted text together with a
            `TextExtractionMetadata` whose `whisper_hash` comes from the
            `X2TextConstants.WHISPER_HASH` response header ("" when the
            header is missing).

    NOTE(review): the helpers called here appear to raise `ExtractorError`
    on request/file failures - confirm against their full bodies.
    """

    # bool() only checks truthiness, so a non-empty string such as "false"
    # would enable highlighting; callers should pass a real boolean.
    response: requests.Response = self._send_whisper_request(
        input_file_path, bool(kwargs.get(X2TextConstants.ENABLE_HIGHLIGHT, False))
    )

    # The hash is read from the response headers of the whisper request;
    # an empty string is used when the header is not present.
    metadata = TextExtractionMetadata(
        whisper_hash=response.headers.get(X2TextConstants.WHISPER_HASH, "")
    )

    return TextExtractionResult(
        extracted_text=self._extract_text_from_response(output_file_path, response),
        extraction_metadata=metadata,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
from typing import Any, Optional

from unstract.adapters.x2text.dto import TextExtractionResult
from unstract.adapters.x2text.helper import UnstructuredHelper
from unstract.adapters.x2text.x2text_adapter import X2TextAdapter

Expand Down Expand Up @@ -41,11 +42,12 @@ def process(
input_file_path: str,
output_file_path: Optional[str] = None,
**kwargs: dict[Any, Any],
) -> str:
output: str = UnstructuredHelper.process_document(
) -> TextExtractionResult:
extracted_text: str = UnstructuredHelper.process_document(
self.config, input_file_path, output_file_path
)
return output

return TextExtractionResult(extracted_text=extracted_text)

def test_connection(self) -> bool:
result: bool = UnstructuredHelper.test_server_connection(self.config)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
from typing import Any, Optional

from unstract.adapters.x2text.dto import TextExtractionResult
from unstract.adapters.x2text.helper import UnstructuredHelper
from unstract.adapters.x2text.x2text_adapter import X2TextAdapter

Expand Down Expand Up @@ -40,12 +41,13 @@ def process(
self,
input_file_path: str,
output_file_path: Optional[str] = None,
**kwargs: dict[Any, Any],
) -> str:
output: str = UnstructuredHelper.process_document(
**kwargs: dict[str, Any],
) -> TextExtractionResult:
extracted_text: str = UnstructuredHelper.process_document(
self.config, input_file_path, output_file_path
)
return output

return TextExtractionResult(extracted_text=extracted_text)

def test_connection(self) -> bool:
result: bool = UnstructuredHelper.test_server_connection(self.config)
Expand Down
9 changes: 6 additions & 3 deletions src/unstract/adapters/x2text/x2text_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from unstract.adapters.base import Adapter
from unstract.adapters.enums import AdapterTypes
from unstract.adapters.x2text.dto import TextExtractionResult


class X2TextAdapter(Adapter, ABC):
Expand Down Expand Up @@ -42,6 +43,8 @@ def process(
input_file_path: str,
output_file_path: Optional[str] = None,
**kwargs: dict[Any, Any],
) -> str:
# Overriding methods will have the actual implementation
return ""
) -> TextExtractionResult:

return TextExtractionResult(
extracted_text="extracted text", extraction_metadata=None
)

0 comments on commit cee091c

Please sign in to comment.