Skip to content

Commit

Permalink
Support using .docx files (#281)
Browse files Browse the repository at this point in the history
Co-authored-by: Philip Meier <github.pmeier@posteo.de>
  • Loading branch information
paskett and pmeier authored Jan 22, 2024
1 parent 61e8d5f commit 008c458
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 0 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ all = [
"lancedb>=0.2",
"pyarrow",
"pymupdf>=1.23.6",
"python-docx",
"tiktoken",
]

Expand Down Expand Up @@ -141,6 +142,7 @@ module = [
"lancedb",
"param",
"pyarrow",
"docx",
"sentence_transformers",
]
ignore_missing_imports = true
Expand Down
2 changes: 2 additions & 0 deletions ragna/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"Document",
"DocumentHandler",
"DocumentUploadParameters",
"DocxDocumentHandler",
"EnvVarRequirement",
"LocalDocument",
"Message",
Expand Down Expand Up @@ -33,6 +34,7 @@
Document,
DocumentHandler,
DocumentUploadParameters,
DocxDocumentHandler,
LocalDocument,
Page,
PdfDocumentHandler,
Expand Down
32 changes: 32 additions & 0 deletions ragna/core/_document.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import abc
import io
import os
import secrets
import time
Expand Down Expand Up @@ -269,3 +270,34 @@ def extract_pages(self, document: Document) -> Iterator[Page]:
) as document:
for number, page in enumerate(document, 1):
yield Page(text=page.get_text(sort=True), number=number)


@DOCUMENT_HANDLERS.load_if_available
class DocxDocumentHandler(DocumentHandler):
    """Document handler for `.docx` documents.

    Every body paragraph that contains visible text is emitted as its own
    `Page`; no page number is attached.

    !!! note

        This does *not* extract text from headers or footers.

    !!! info "Package requirements"

        - [`python-docx`](https://github.com/python-openxml/python-docx)
    """

    @classmethod
    def requirements(cls) -> list[Requirement]:
        """Return the requirements that must be met to use this handler."""
        return [PackageRequirement("python-docx")]

    @classmethod
    def supported_suffixes(cls) -> list[str]:
        """Return the file suffixes this handler is able to process."""
        return [".docx"]

    def extract_pages(self, document: Document) -> Iterator[Page]:
        """Yield one page per paragraph that contains visible text.

        Args:
            document: Document whose content is a `.docx` file.

        Returns:
            Iterator over pages, one per non-blank paragraph, in the order
            reported by `python-docx`. The paragraph text is passed through
            unmodified.
        """
        # Imported here rather than at module level because python-docx is an
        # optional requirement of this handler.
        import docx

        # python-docx accepts a file-like object, so the content is parsed
        # straight from memory.
        document_docx = docx.Document(io.BytesIO(document.read()))
        for paragraph in document_docx.paragraphs:
            text = paragraph.text
            # `Paragraph.text` renders tabs and line breaks as "\t" / "\n", so
            # a paragraph can be non-empty and still carry no content. Skip
            # those as well as truly empty paragraphs.
            if text.strip():
                yield Page(text=text)
22 changes: 22 additions & 0 deletions tests/core/test_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import docx

from ragna.core import DocxDocumentHandler, LocalDocument


def get_docx_document(tmp_path, docx_text):
    """Create a `.docx` file inside `tmp_path` and return it as a document.

    The file holds a heading followed by a paragraph, both carrying
    `docx_text`.
    """
    path = tmp_path / "test_document.docx"

    docx_file = docx.Document()
    # Heading first, then paragraph, so the document order stays the same.
    for add_block in (docx_file.add_heading, docx_file.add_paragraph):
        add_block(docx_text)
    docx_file.save(path)

    return LocalDocument.from_path(path)


def test_docx(tmp_path):
    """Check that a `.docx` file gets the docx handler and is read back."""
    expected = "ragna is neat!"
    local_document = get_docx_document(tmp_path, expected)

    assert isinstance(local_document.handler, DocxDocumentHandler)

    # The fixture writes a heading and a paragraph, hence two pages.
    extracted = [page.text for page in local_document.extract_pages()]
    assert len(extracted) == 2
    assert all(text == expected for text in extracted)

0 comments on commit 008c458

Please sign in to comment.