diff --git a/data_extraction/text_extraction.py b/data_extraction/text_extraction.py
index 5ed87df..79f661e 100644
--- a/data_extraction/text_extraction.py
+++ b/data_extraction/text_extraction.py
@@ -22,11 +22,11 @@ def _return_file_content(self, filepath: str) -> str:
         with open(filepath, "r") as file:
             return file.read()
 
-    def _try_extract_text(self, filepath: str) -> str:
-        if self.is_txt(filepath):
+    def _try_extract_text(self, filepath: str, file_type) -> str:
+        if self.is_txt(file_type):
             return self._return_file_content(filepath)
         with open(filepath, "rb") as file:
-            headers = {"Content-Type": self._get_file_type(filepath)}
+            headers = {"Content-Type": file_type}
             response = requests.put(f"{self._url}/tika", data=file, headers=headers)
             response.encoding = "UTF-8"
             return response.text
@@ -34,9 +34,10 @@ def _try_extract_text(self, filepath: str) -> str:
     def extract_text(self, filepath: str) -> str:
         logging.debug(f"Extracting text from {filepath}")
         self.check_file_exists(filepath)
-        self.check_file_type_supported(filepath)
+        file_type = self.get_file_type(filepath)
+        self.check_file_type_supported(file_type)
         try:
-            return self._try_extract_text(filepath)
+            return self._try_extract_text(filepath, file_type)
         except Exception as e:
             raise Exception("Could not extract file content") from e
 
@@ -44,22 +45,22 @@ def check_file_exists(self, filepath: str):
         if not os.path.exists(filepath):
             raise Exception(f"File does not exists: {filepath}")
 
-    def check_file_type_supported(self, filepath: str) -> None:
+    def check_file_type_supported(self, found_type) -> None:
         if (
-            not self.is_doc(filepath)
-            and not self.is_pdf(filepath)
-            and not self.is_txt(filepath)
+            not self.is_doc(found_type)
+            and not self.is_pdf(found_type)
+            and not self.is_txt(found_type)
         ):
-            raise Exception("Unsupported file type: " + self.get_file_type(filepath))
+            raise Exception("Unsupported file type: " + found_type)
 
-    def is_pdf(self, filepath):
+    def is_pdf(self, found_type):
         """
         If the file type is pdf
         returns True. Otherwise, returns False
         """
-        return self.is_file_type(filepath, file_types=["application/pdf"])
+        return found_type in ["application/pdf"]
 
-    def is_doc(self, filepath):
+    def is_doc(self, found_type):
         """
         If the file type is doc or similar
         returns True. Otherwise, returns False
@@ -69,14 +70,14 @@ def is_doc(self, filepath):
             "application/vnd.oasis.opendocument.text",
             "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
         ]
-        return self.is_file_type(filepath, file_types)
+        return found_type in file_types
 
-    def is_txt(self, filepath):
+    def is_txt(self, found_type):
         """
         If the file type is txt
         returns True. Otherwise, returns False
         """
-        return self.is_file_type(filepath, file_types=["text/plain"])
+        return found_type in ["text/plain"]
 
     def get_file_type(self, filepath):
         """
@@ -84,11 +85,11 @@ def get_file_type(self, filepath):
         """
         return magic.from_file(filepath, mime=True)
 
-    def is_file_type(self, filepath, file_types):
+    def is_file_type(self, found_type, file_types):
         """
         Generic method to check if a identified file type matches a given list of types
         """
-        return self.get_file_type(filepath) in file_types
+        return found_type in file_types
 
 
 def get_apache_tika_server_url():