diff --git a/ted_sws/notice_metadata_processor/adapters/notice_metadata_normaliser.py b/ted_sws/notice_metadata_processor/adapters/notice_metadata_normaliser.py index 2dace386..e1f1187d 100644 --- a/ted_sws/notice_metadata_processor/adapters/notice_metadata_normaliser.py +++ b/ted_sws/notice_metadata_processor/adapters/notice_metadata_normaliser.py @@ -3,6 +3,7 @@ from typing import Dict, Tuple, List import re import pandas as pd +import html from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString, NoticeSource from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata @@ -42,6 +43,10 @@ mapping_registry = MappingFilesRegistry() +def get_html_compatible_string(input_string: LanguageTaggedString) -> LanguageTaggedString: + """Convert string to HTML compatible format using HTML encoding.""" + return LanguageTaggedString(text=html.escape(input_string.text), language=input_string.language) + def get_map_list_value_by_code(mapping: Dict, listing: List): result = [] for element in listing: @@ -223,15 +228,15 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise extracted_metadata = extracted_metadata metadata = { - TITLE_KEY: [title.title for title in extracted_metadata.title], + TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title], LONG_TITLE_KEY: [ - LanguageTaggedString(text=JOIN_SEP.join( + get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join( [ title.title_country.text, title.title_city.text, title.title.text ]), - language=title.title.language) for title in extracted_metadata.title + language=title.title.language)) for title in extracted_metadata.title ], NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(), PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date), @@ -315,14 +320,14 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise form_type, notice_type, legal_basis = self.get_form_type_notice_type_and_legal_basis( extracted_notice_subtype=extracted_metadata.extracted_notice_subtype) metadata = { - TITLE_KEY: [title.title for title in extracted_metadata.title], + TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title], LONG_TITLE_KEY: [ - LanguageTaggedString(text=JOIN_SEP.join( + get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join( [ title.title_country.text, title.title.text ]), - language=title.title.language) for title in extracted_metadata.title + language=title.title.language)) for title in extracted_metadata.title ], NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(), PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date), diff --git a/ted_sws/notice_packager/resources/templates/mets_xml_dmd_rdf.jinja2 b/ted_sws/notice_packager/resources/templates/mets_xml_dmd_rdf.jinja2 index cad25705..775d1467 100644 --- a/ted_sws/notice_packager/resources/templates/mets_xml_dmd_rdf.jinja2 +++ b/ted_sws/notice_packager/resources/templates/mets_xml_dmd_rdf.jinja2 @@ -21,7 +21,7 @@ {{ notice.public_number_document }} {{ notice.public_number_edition }} {% for lang in work.title %} - {{ work.title[lang] | e }} + {{ work.title[lang] }} {% endfor %} {{ work.datetime_transmission }} {# {{ work.procurement_public_issued_by_country }} @@ -44,7 +44,7 @@ {% for lang in expression.title %} - {{ expression.title[lang] | e }} + {{ expression.title[lang] }} {% endfor %} Other diff --git a/tests/test_data/notice_normalisation/spaces_in_publication_number/ef_notice_with_spaces_in_publication_number.xml b/tests/test_data/notice_normalisation/ef_html_unsafe_notice.xml similarity index 100% rename from tests/test_data/notice_normalisation/spaces_in_publication_number/ef_notice_with_spaces_in_publication_number.xml rename to tests/test_data/notice_normalisation/ef_html_unsafe_notice.xml diff --git a/tests/test_data/notice_normalisation/spaces_in_publication_number/sf_notice_with_spaces_in_publication_number.xml b/tests/test_data/notice_normalisation/sf_html_unsafe_notice.xml similarity index 100% rename from tests/test_data/notice_normalisation/spaces_in_publication_number/sf_notice_with_spaces_in_publication_number.xml rename to tests/test_data/notice_normalisation/sf_html_unsafe_notice.xml diff --git a/tests/unit/notice_metadata_processor/conftest.py b/tests/unit/notice_metadata_processor/conftest.py index ae64f1c4..fcd22160 100644 --- a/tests/unit/notice_metadata_processor/conftest.py +++ b/tests/unit/notice_metadata_processor/conftest.py @@ -31,30 +31,36 @@ def eforms_xml_notice_paths() -> List[pathlib.Path]: @pytest.fixture -def sample_ef_notice_with_spaces_in_publication_number_path() -> pathlib.Path: - return TEST_DATA_PATH / "notice_normalisation" / "spaces_in_publication_number" / "ef_notice_with_spaces_in_publication_number.xml" +def sample_ef_html_unsafe_notice_path() -> pathlib.Path: + return TEST_DATA_PATH / "notice_normalisation" / "ef_html_unsafe_notice.xml" @pytest.fixture -def sample_indexed_ef_notice_with_spaces_in_publication_number( - sample_ef_notice_with_spaces_in_publication_number_path: pathlib.Path) -> Notice: - notice: Notice = Notice(ted_id=sample_ef_notice_with_spaces_in_publication_number_path.name) +def sample_indexed_ef_html_unsafe_notice( + sample_ef_html_unsafe_notice_path: pathlib.Path) -> Notice: + notice: Notice = Notice(ted_id=sample_ef_html_unsafe_notice_path.name) notice.set_xml_manifestation( - XMLManifestation(object_data=sample_ef_notice_with_spaces_in_publication_number_path.read_text())) + XMLManifestation(object_data=sample_ef_html_unsafe_notice_path.read_text())) return index_notice(notice) @pytest.fixture -def sample_sf_notice_with_spaces_in_publication_number_path() -> pathlib.Path: - return TEST_DATA_PATH / "notice_normalisation" / "spaces_in_publication_number" / "sf_notice_with_spaces_in_publication_number.xml" +def sample_sf_html_unsafe_notice_path() -> pathlib.Path: + return TEST_DATA_PATH / "notice_normalisation" / "sf_html_unsafe_notice.xml" @pytest.fixture -def sample_indexed_sf_notice_with_spaces_in_publication_number( - sample_sf_notice_with_spaces_in_publication_number_path: pathlib.Path) -> Notice: - notice: Notice = Notice(ted_id=sample_sf_notice_with_spaces_in_publication_number_path.name) +def sample_indexed_sf_html_unsafe_notice( + sample_sf_html_unsafe_notice_path: pathlib.Path) -> Notice: + notice: Notice = Notice(ted_id=sample_sf_html_unsafe_notice_path.name) notice.set_xml_manifestation( - XMLManifestation(object_data=sample_sf_notice_with_spaces_in_publication_number_path.read_text())) + XMLManifestation(object_data=sample_sf_html_unsafe_notice_path.read_text())) return index_notice(notice) + + +@pytest.fixture +def html_incompatible_str() -> str: + """Provides a test string containing HTML incompatible characters.""" + return "Construction work & planning
" diff --git a/tests/unit/notice_metadata_processor/test_metadata_normaliser.py b/tests/unit/notice_metadata_processor/test_metadata_normaliser.py index 423d693a..dad59604 100644 --- a/tests/unit/notice_metadata_processor/test_metadata_normaliser.py +++ b/tests/unit/notice_metadata_processor/test_metadata_normaliser.py @@ -1,13 +1,17 @@ +import pathlib +from xml.etree import ElementTree +from xml.etree.ElementTree import ParseError + import pytest from ted_sws.core.model.manifestation import XMLManifestation -from ted_sws.core.model.metadata import NormalisedMetadata +from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString from ted_sws.core.model.notice import NoticeStatus, Notice from ted_sws.notice_metadata_processor.adapters.notice_metadata_extractor import \ DefaultNoticeMetadataExtractor, EformsNoticeMetadataExtractor from ted_sws.notice_metadata_processor.adapters.notice_metadata_normaliser import \ DefaultNoticeMetadataNormaliser, get_map_value, FORM_NUMBER_KEY, LEGAL_BASIS_KEY, SF_NOTICE_TYPE_KEY, \ - DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser + DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser, get_html_compatible_string from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata from ted_sws.notice_metadata_processor.services.metadata_constraints import filter_df_by_variables from ted_sws.notice_metadata_processor.services.metadata_normalizer import normalise_notice, normalise_notice_by_id, \ @@ -16,6 +20,8 @@ extract_and_normalise_notice_metadata from ted_sws.resources.mapping_files_registry import MappingFilesRegistry +def html_str(content: str) -> str: + return f""" {content}""" def test_metadata_normaliser_by_notice(indexed_notice): notice = normalise_notice(indexed_notice) @@ -237,13 +243,36 @@ def test_normalising_notice_out_of_index(notice_normalisation_test_data_path): xml_manifestation=XMLManifestation(object_data=broke_notice_content)) -def test_normalising_notice_with_spaces_in_notice_id(sample_indexed_ef_notice_with_spaces_in_publication_number: Notice, - sample_indexed_sf_notice_with_spaces_in_publication_number: Notice +def test_normalising_notice_with_spaces_in_notice_id(sample_indexed_ef_html_unsafe_notice: Notice, + sample_indexed_sf_html_unsafe_notice: Notice ): - normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_notice_with_spaces_in_publication_number) + normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_html_unsafe_notice) assert normalised_ef_notice.normalised_metadata.notice_publication_number.strip() == normalised_ef_notice.normalised_metadata.notice_publication_number - normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_notice_with_spaces_in_publication_number) + normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_html_unsafe_notice) + + assert normalised_sf_notice.normalised_metadata.notice_publication_number.strip() == normalised_sf_notice.normalised_metadata.notice_publication_number + + +def test_get_html_compatible_string(html_incompatible_str: str): + with pytest.raises(ParseError): + ElementTree.fromstring(html_incompatible_str) + + compatible_str: LanguageTaggedString = get_html_compatible_string(LanguageTaggedString(text=html_incompatible_str)) + + + # Parse to check if str is well-formed (HTML-safe sequences or elements) + ElementTree.fromstring(html_str(compatible_str.text)) + + +def test_normalising_notice_with_html_incompatible_title(sample_indexed_ef_html_unsafe_notice: Notice, + sample_indexed_sf_html_unsafe_notice: Notice): + + normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_html_unsafe_notice) + + [ElementTree.fromstring(html_str(title.text)) for title in normalised_ef_notice.normalised_metadata.title ] + + normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_html_unsafe_notice) - assert normalised_sf_notice.normalised_metadata.notice_publication_number.strip() == normalised_sf_notice.normalised_metadata.notice_publication_number \ No newline at end of file + [ElementTree.fromstring(html_str(title.text)) for title in normalised_sf_notice.normalised_metadata.title] diff --git a/tests/unit/notice_packager/test_template_generator.py b/tests/unit/notice_packager/test_template_generator.py index 1ee2700d..e67cbcc3 100644 --- a/tests/unit/notice_packager/test_template_generator.py +++ b/tests/unit/notice_packager/test_template_generator.py @@ -62,13 +62,13 @@ def test_mets2action_mets_xml_generator_with_wrong_action(template_sample_metada TemplateGenerator.mets2action_mets_xml_generator(template_sample_metadata) -def test_mets_dmd_rdf_has_html_safe_sequences_after_generation(sample_metadata_with_wrong_title: PackagerMetadata, - sample_mets_xml_dmd_rdf_with_wrong_title_str: str): - # Ensure parser raises error on not well-formed xml (HTML sequences or elements) - with pytest.raises(ParseError): - ElementTree.fromstring(sample_mets_xml_dmd_rdf_with_wrong_title_str) - - mets_dmd_rdf: str = TemplateGenerator.mets_xml_dmd_rdf_generator(sample_metadata_with_wrong_title) - - # Parse to check if xml is well-formed (HTML-safe sequences or elements) - ElementTree.fromstring(mets_dmd_rdf) +# def test_mets_dmd_rdf_has_html_safe_sequences_after_generation(sample_metadata_with_wrong_title: PackagerMetadata, +# sample_mets_xml_dmd_rdf_with_wrong_title_str: str): +# # Ensure parser raises error on not well-formed xml (HTML sequences or elements) +# with pytest.raises(ParseError): +# ElementTree.fromstring(sample_mets_xml_dmd_rdf_with_wrong_title_str) +# +# mets_dmd_rdf: str = TemplateGenerator.mets_xml_dmd_rdf_generator(sample_metadata_with_wrong_title) +# +# # Parse to check if xml is well-formed (HTML-safe sequences or elements) +# ElementTree.fromstring(mets_dmd_rdf)