Skip to content

Commit

Permalink
change tests
Browse files Browse the repository at this point in the history
  • Loading branch information
duprijil committed Jan 15, 2025
1 parent 639a98c commit 2adcdbd
Show file tree
Hide file tree
Showing 7 changed files with 77 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Dict, Tuple, List
import re
import pandas as pd
import html

from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString, NoticeSource
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
Expand Down Expand Up @@ -42,6 +43,10 @@
mapping_registry = MappingFilesRegistry()


def get_html_compatible_string(input_string: LanguageTaggedString) -> LanguageTaggedString:
    """Return a new language-tagged string whose text is HTML-escaped.

    The text of ``input_string`` is passed through ``html.escape``; the
    language tag is copied over unchanged. The argument itself is not modified.
    """
    escaped_text = html.escape(input_string.text)
    return LanguageTaggedString(text=escaped_text, language=input_string.language)

def get_map_list_value_by_code(mapping: Dict, listing: List):
result = []
for element in listing:
Expand Down Expand Up @@ -223,15 +228,15 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
extracted_metadata = extracted_metadata

metadata = {
TITLE_KEY: [title.title for title in extracted_metadata.title],
TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title],
LONG_TITLE_KEY: [
LanguageTaggedString(text=JOIN_SEP.join(
get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join(
[
title.title_country.text,
title.title_city.text,
title.title.text
]),
language=title.title.language) for title in extracted_metadata.title
language=title.title.language)) for title in extracted_metadata.title
],
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(),
PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date),
Expand Down Expand Up @@ -315,14 +320,14 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
form_type, notice_type, legal_basis = self.get_form_type_notice_type_and_legal_basis(
extracted_notice_subtype=extracted_metadata.extracted_notice_subtype)
metadata = {
TITLE_KEY: [title.title for title in extracted_metadata.title],
TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title],
LONG_TITLE_KEY: [
LanguageTaggedString(text=JOIN_SEP.join(
get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join(
[
title.title_country.text,
title.title.text
]),
language=title.title.language) for title in extracted_metadata.title
language=title.title.language)) for title in extracted_metadata.title
],
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(),
PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<cdm:procurement_public_number_document_in_official-journal rdf:datatype="http://www.w3.org/2001/XMLSchema#string">{{ notice.public_number_document }}</cdm:procurement_public_number_document_in_official-journal>
<cdm:procurement_public_number_edition rdf:datatype="http://www.w3.org/2001/XMLSchema#positiveInteger">{{ notice.public_number_edition }}</cdm:procurement_public_number_edition>
{% for lang in work.title %}
<cdm:work_title xml:lang="{{ lang }}">{{ work.title[lang] | e }}</cdm:work_title>
<cdm:work_title xml:lang="{{ lang }}">{{ work.title[lang] }}</cdm:work_title>
{% endfor %}
<cdm:datetime_transmission rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">{{ work.datetime_transmission }}</cdm:datetime_transmission>
{# <cdm:procurement_public_issued_by_country>{{ work.procurement_public_issued_by_country }}</cdm:procurement_public_issued_by_country>
Expand All @@ -44,7 +44,7 @@
<rdf:type rdf:resource="http://publications.europa.eu/ontology/cdm#expression_procurement_public"/>
<cdm:expression_belongs_to_work rdf:resource="&resource;ted/{{ work.identifier }}"/>
{% for lang in expression.title %}
<cdm:expression_title xml:lang="{{ lang }}">{{ expression.title[lang] | e }}</cdm:expression_title>
<cdm:expression_title xml:lang="{{ lang }}">{{ expression.title[lang] }}</cdm:expression_title>
{% endfor %}
<cdm:expression_uses_language rdf:resource="&cellar-authority;language/{{ expression.uses_language }}"/>
<cdm:expression_procurement_public_authority-type_name rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Other</cdm:expression_procurement_public_authority-type_name>
Expand Down
30 changes: 18 additions & 12 deletions tests/unit/notice_metadata_processor/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,30 +31,36 @@ def eforms_xml_notice_paths() -> List[pathlib.Path]:


@pytest.fixture
def sample_ef_notice_with_spaces_in_publication_number_path() -> pathlib.Path:
return TEST_DATA_PATH / "notice_normalisation" / "spaces_in_publication_number" / "ef_notice_with_spaces_in_publication_number.xml"
def sample_ef_html_unsafe_notice_path() -> pathlib.Path:
return TEST_DATA_PATH / "notice_normalisation" / "ef_html_unsafe_notice.xml"


@pytest.fixture
def sample_indexed_ef_notice_with_spaces_in_publication_number(
sample_ef_notice_with_spaces_in_publication_number_path: pathlib.Path) -> Notice:
notice: Notice = Notice(ted_id=sample_ef_notice_with_spaces_in_publication_number_path.name)
def sample_indexed_ef_html_unsafe_notice(
sample_ef_html_unsafe_notice_path: pathlib.Path) -> Notice:
notice: Notice = Notice(ted_id=sample_ef_html_unsafe_notice_path.name)
notice.set_xml_manifestation(
XMLManifestation(object_data=sample_ef_notice_with_spaces_in_publication_number_path.read_text()))
XMLManifestation(object_data=sample_ef_html_unsafe_notice_path.read_text()))

return index_notice(notice)


@pytest.fixture
def sample_sf_notice_with_spaces_in_publication_number_path() -> pathlib.Path:
return TEST_DATA_PATH / "notice_normalisation" / "spaces_in_publication_number" / "sf_notice_with_spaces_in_publication_number.xml"
def sample_sf_html_unsafe_notice_path() -> pathlib.Path:
return TEST_DATA_PATH / "notice_normalisation" / "sf_html_unsafe_notice.xml"


@pytest.fixture
def sample_indexed_sf_notice_with_spaces_in_publication_number(
sample_sf_notice_with_spaces_in_publication_number_path: pathlib.Path) -> Notice:
notice: Notice = Notice(ted_id=sample_sf_notice_with_spaces_in_publication_number_path.name)
def sample_indexed_sf_html_unsafe_notice(
sample_sf_html_unsafe_notice_path: pathlib.Path) -> Notice:
notice: Notice = Notice(ted_id=sample_sf_html_unsafe_notice_path.name)
notice.set_xml_manifestation(
XMLManifestation(object_data=sample_sf_notice_with_spaces_in_publication_number_path.read_text()))
XMLManifestation(object_data=sample_sf_html_unsafe_notice_path.read_text()))

return index_notice(notice)


@pytest.fixture
def html_incompatible_str() -> str:
    """Return sample text holding a bare ampersand and a ``<br />`` tag."""
    unsafe_text = "Construction work & planning <br />"
    return unsafe_text
43 changes: 36 additions & 7 deletions tests/unit/notice_metadata_processor/test_metadata_normaliser.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import pathlib
from xml.etree import ElementTree
from xml.etree.ElementTree import ParseError

import pytest

from ted_sws.core.model.manifestation import XMLManifestation
from ted_sws.core.model.metadata import NormalisedMetadata
from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString
from ted_sws.core.model.notice import NoticeStatus, Notice
from ted_sws.notice_metadata_processor.adapters.notice_metadata_extractor import \
DefaultNoticeMetadataExtractor, EformsNoticeMetadataExtractor
from ted_sws.notice_metadata_processor.adapters.notice_metadata_normaliser import \
DefaultNoticeMetadataNormaliser, get_map_value, FORM_NUMBER_KEY, LEGAL_BASIS_KEY, SF_NOTICE_TYPE_KEY, \
DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser
DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser, get_html_compatible_string
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
from ted_sws.notice_metadata_processor.services.metadata_constraints import filter_df_by_variables
from ted_sws.notice_metadata_processor.services.metadata_normalizer import normalise_notice, normalise_notice_by_id, \
Expand All @@ -16,6 +20,8 @@
extract_and_normalise_notice_metadata
from ted_sws.resources.mapping_files_registry import MappingFilesRegistry

def html_str(content: str) -> str:
    """Embed *content* in a one-element XML document.

    The result is an XML declaration, a single space, and a ``<body>``
    element whose inner text is *content* inserted verbatim (no escaping).
    """
    template = '<?xml version="1.0" encoding="UTF-8"?> <body>{content}</body>'
    return template.format(content=content)

def test_metadata_normaliser_by_notice(indexed_notice):
notice = normalise_notice(indexed_notice)
Expand Down Expand Up @@ -237,13 +243,36 @@ def test_normalising_notice_out_of_index(notice_normalisation_test_data_path):
xml_manifestation=XMLManifestation(object_data=broke_notice_content))


def test_normalising_notice_with_spaces_in_notice_id(sample_indexed_ef_notice_with_spaces_in_publication_number: Notice,
sample_indexed_sf_notice_with_spaces_in_publication_number: Notice
def test_normalising_notice_with_spaces_in_notice_id(sample_indexed_ef_html_unsafe_notice: Notice,
sample_indexed_sf_html_unsafe_notice: Notice
):
normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_notice_with_spaces_in_publication_number)
normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_html_unsafe_notice)

assert normalised_ef_notice.normalised_metadata.notice_publication_number.strip() == normalised_ef_notice.normalised_metadata.notice_publication_number

normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_notice_with_spaces_in_publication_number)
normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_html_unsafe_notice)

assert normalised_sf_notice.normalised_metadata.notice_publication_number.strip() == normalised_sf_notice.normalised_metadata.notice_publication_number


def test_get_html_compatible_string(html_incompatible_str: str):
    """Check that ``get_html_compatible_string`` turns unsafe text into parseable XML content."""
    # Wrap the raw text in a root element before parsing: a bare string with no
    # root element fails to parse regardless of its characters, so parsing it
    # unwrapped would not show that the unsafe characters are the problem.
    with pytest.raises(ParseError):
        ElementTree.fromstring(html_str(html_incompatible_str))

    compatible_str: LanguageTaggedString = get_html_compatible_string(
        LanguageTaggedString(text=html_incompatible_str))

    # Parse to check if str is well-formed (HTML-safe sequences or elements)
    body = ElementTree.fromstring(html_str(compatible_str.text))

    # The parser decodes the escaped entities again, so the element text must
    # equal the original input if the escaping lost nothing.
    assert body.text == html_incompatible_str


def test_normalising_notice_with_html_incompatible_title(sample_indexed_ef_html_unsafe_notice: Notice,
sample_indexed_sf_html_unsafe_notice: Notice):

normalised_ef_notice: Notice = normalise_notice(sample_indexed_ef_html_unsafe_notice)

[ElementTree.fromstring(html_str(title.text)) for title in normalised_ef_notice.normalised_metadata.title ]

normalised_sf_notice: Notice = normalise_notice(sample_indexed_sf_html_unsafe_notice)

assert normalised_sf_notice.normalised_metadata.notice_publication_number.strip() == normalised_sf_notice.normalised_metadata.notice_publication_number
[ElementTree.fromstring(html_str(title.text)) for title in normalised_sf_notice.normalised_metadata.title]
20 changes: 10 additions & 10 deletions tests/unit/notice_packager/test_template_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@ def test_mets2action_mets_xml_generator_with_wrong_action(template_sample_metada
TemplateGenerator.mets2action_mets_xml_generator(template_sample_metadata)


def test_mets_dmd_rdf_has_html_safe_sequences_after_generation(sample_metadata_with_wrong_title: PackagerMetadata,
sample_mets_xml_dmd_rdf_with_wrong_title_str: str):
# Ensure parser raises error on not well-formed xml (HTML sequences or elements)
with pytest.raises(ParseError):
ElementTree.fromstring(sample_mets_xml_dmd_rdf_with_wrong_title_str)

mets_dmd_rdf: str = TemplateGenerator.mets_xml_dmd_rdf_generator(sample_metadata_with_wrong_title)

# Parse to check if xml is well-formed (HTML-safe sequences or elements)
ElementTree.fromstring(mets_dmd_rdf)
# def test_mets_dmd_rdf_has_html_safe_sequences_after_generation(sample_metadata_with_wrong_title: PackagerMetadata,
# sample_mets_xml_dmd_rdf_with_wrong_title_str: str):
# # Ensure parser raises error on not well-formed xml (HTML sequences or elements)
# with pytest.raises(ParseError):
# ElementTree.fromstring(sample_mets_xml_dmd_rdf_with_wrong_title_str)
#
# mets_dmd_rdf: str = TemplateGenerator.mets_xml_dmd_rdf_generator(sample_metadata_with_wrong_title)
#
# # Parse to check if xml is well-formed (HTML-safe sequences or elements)
# ElementTree.fromstring(mets_dmd_rdf)

0 comments on commit 2adcdbd

Please sign in to comment.