Skip to content

Commit

Permalink
Merge pull request #560 from OP-TED/feature/SWS1-12
Browse files Browse the repository at this point in the history
Fix issue #549
  • Loading branch information
duprijil authored Jan 15, 2025
2 parents 6c03617 + fdfc7f6 commit adc2f3f
Show file tree
Hide file tree
Showing 9 changed files with 4,767 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def create_notice_collection_materialised_view(mongo_client: MongoClient):
"publication_date_str_ym": "$normalised_metadata.publication_date_str_ym",
"publication_date_str_ymd": "$normalised_metadata.publication_date_str_ymd",
"deduplication_report": "$rdf_manifestation.deduplication_report",
"notice_source": "$normalised_metadata.notice_source",
"eform_sdk_version": "$normalised_metadata.eform_sdk_version",
}
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Dict, Tuple, List
import re
import pandas as pd
import html

from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString, NoticeSource
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
Expand Down Expand Up @@ -42,6 +43,10 @@
mapping_registry = MappingFilesRegistry()


def get_html_compatible_string(input_string: LanguageTaggedString) -> LanguageTaggedString:
    """Return a new LanguageTaggedString whose text is HTML-escaped.

    The text of *input_string* is passed through ``html.escape`` (which, with
    its default ``quote=True``, escapes ``&``, ``<``, ``>`` and both quote
    characters). The language tag is carried over unchanged.

    :param input_string: the language-tagged string to escape
    :return: a language-tagged string with escaped text and the same language
    """
    escaped_text = html.escape(input_string.text)
    return LanguageTaggedString(text=escaped_text, language=input_string.language)

def get_map_list_value_by_code(mapping: Dict, listing: List):
result = []
for element in listing:
Expand Down Expand Up @@ -223,17 +228,17 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
extracted_metadata = extracted_metadata

metadata = {
TITLE_KEY: [title.title for title in extracted_metadata.title],
TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title],
LONG_TITLE_KEY: [
LanguageTaggedString(text=JOIN_SEP.join(
get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join(
[
title.title_country.text,
title.title_city.text,
title.title.text
]),
language=title.title.language) for title in extracted_metadata.title
language=title.title.language)) for title in extracted_metadata.title
],
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number,
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(),
PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date),
OJS_NUMBER_KEY: extracted_metadata.ojs_issue_number,
OJS_TYPE_KEY: extracted_metadata.ojs_type if extracted_metadata.ojs_type else "S",
Expand Down Expand Up @@ -315,16 +320,16 @@ def normalise_metadata(self, extracted_metadata: ExtractedMetadata) -> Normalise
form_type, notice_type, legal_basis = self.get_form_type_notice_type_and_legal_basis(
extracted_notice_subtype=extracted_metadata.extracted_notice_subtype)
metadata = {
TITLE_KEY: [title.title for title in extracted_metadata.title],
TITLE_KEY: [get_html_compatible_string(title.title) for title in extracted_metadata.title],
LONG_TITLE_KEY: [
LanguageTaggedString(text=JOIN_SEP.join(
get_html_compatible_string(LanguageTaggedString(text=JOIN_SEP.join(
[
title.title_country.text,
title.title.text
]),
language=title.title.language) for title in extracted_metadata.title
language=title.title.language)) for title in extracted_metadata.title
],
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number,
NOTICE_NUMBER_KEY: extracted_metadata.notice_publication_number.strip(),
PUBLICATION_DATE_KEY: self.iso_date_format(extracted_metadata.publication_date),
OJS_NUMBER_KEY: extracted_metadata.ojs_issue_number,
OJS_TYPE_KEY: extracted_metadata.ojs_type if extracted_metadata.ojs_type else "S",
Expand Down
395 changes: 395 additions & 0 deletions tests/test_data/notice_normalisation/ef_html_unsafe_notice.xml

Large diffs are not rendered by default.

4,135 changes: 4,135 additions & 0 deletions tests/test_data/notice_normalisation/sf_html_unsafe_notice.xml

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{
"notice": {
"id": "003545_2021",
"public_number_document": "003545",
"public_number_edition": "2021004"
},
"mets": {
"languages": [
"en"
],
"revision": "0",
"type": "create",
"profile": "http://publications.europa.eu/resource/mets/op-sip-profile_002",
"createdate": "2023-03-09T18:28:54.804225",
"document_id": "",
"dmd_id": "dmd_2021_S_004_003545_0_001",
"dmd_mdtype": "OTHER",
"dmd_othermdtype": "INSTANCE",
"dmd_href": "2021_S_004_003545_0.mets.xml.dmd.rdf",
"tmd_id": "tmd_2021_S_004_003545_0_001",
"tmd_href": "2021_S_004_003545_0.tmd.rdf",
"tmd_mdtype": "OTHER",
"tmd_othermdtype": "INSTANCE",
"file_id": "file_2021_S_004_003545_0_001",
"notice_file_href": "2021_S_004_003545_0.notice.rdf",
"notice_file_mimetype": "application/rdf+xml",
"notice_file_checksum": "00e2c0570f2d9f00c71c3d8009b8bec5a530167a01ebb473e67be5e97383cdc5",
"notice_file_checksum_type": "SHA-256"
},
"work": {
"identifier": "2021_S_004_003545",
"oj_identifier": "JOS_2021_004_R_003545",
"cdm_rdf_type": "procurement_public",
"resource_type": "PROCUREMENT_NOTICE",
"uri": "http://data.europa.eu/a4g/resource/2021/003545_2021",
"do_not_index": "true",
"date_document": "2021-01-07",
"created_by_agent": "EURUN",
"dataset_published_by_agent": "EURUN",
"datetime_transmission": "2023-03-09T18:28:54.806241",
"title": {
"en": "Construction work & planning",
"ro": "Lucrari de constructie <br /> si planificare"
},
"date_creation": "2023-03-09",
"concept_type_dataset": "TEST_DATA",
"dataset_version": "20230309-0",
"dataset_keyword": [
"eProcurement",
"notice"
],
"dataset_has_frequency_publication_frequency": "OTHER",
"procurement_public_issued_by_country": "CZ",
"procurement_public_url_etendering": []
},
"expression": {
"identifier": "2021_S_004_003545.MUL",
"title": {
"en": " eProcurement notice 2021_S_004_003545 "
},
"uses_language": "MUL"
},
"manifestation": {
"identifier": "2021_S_004_003545.MUL.rdf",
"type": "rdf_epo",
"date_publication": "2021-01-07",
"distribution_has_status_distribution_status": "COMPLETED",
"distribution_has_media_type_concept_media_type": "RDF"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE rdf:RDF [
<!ENTITY % cellarEntities PUBLIC
"-//PO-RESOURCE//ENTITIES CELLAR cdm model 1.0//EN"
"/home/metaconv/metaconv_components/components/common/data/cellar_uris.ent">
%cellarEntities;
]>
<rdf:RDF xmlns:cdm="http://publications.europa.eu/ontology/cdm#"
xmlns:dct="http://purl.org/dc/terms/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">

<cdm:work rdf:about="&resource;ted/2021_S_004_003545">

<rdf:type rdf:resource="http://publications.europa.eu/ontology/cdm#procurement_public"/>
<cdm:work_id_document rdf:datatype="http://www.w3.org/2001/XMLSchema#string">ted:2021_S_004_003545</cdm:work_id_document>
<cdm:work_id_document rdf:datatype="http://www.w3.org/2001/XMLSchema#string">oj:JOS_2021_004_R_003545</cdm:work_id_document>
<cdm:work_has_resource-type rdf:resource="http://publications.europa.eu/resource/authority/resource-type/PROCUREMENT_NOTICE"/>
<cdm:do_not_index rdf:datatype="http://www.w3.org/2001/XMLSchema#boolean">true</cdm:do_not_index>
<cdm:work_date_document rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2021-01-07</cdm:work_date_document>
<cdm:work_created_by_agent rdf:resource="&cellar-authority;corporate-body/EURUN"/>
<cdm:procurement_public_number_document_in_official-journal rdf:datatype="http://www.w3.org/2001/XMLSchema#string">003545</cdm:procurement_public_number_document_in_official-journal>
<cdm:procurement_public_number_edition rdf:datatype="http://www.w3.org/2001/XMLSchema#positiveInteger">2021004</cdm:procurement_public_number_edition>

<cdm:work_title xml:lang="en">Construction work & planning</cdm:work_title>

<cdm:work_title xml:lang="ro">Lucrari de constructie <br /> si planificare</cdm:work_title>

<cdm:datetime_transmission rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2023-03-09T18:28:54.806241</cdm:datetime_transmission>


</cdm:work>

<cdm:expression rdf:about="&resource;ted/2021_S_004_003545.MUL">

<rdf:type rdf:resource="http://publications.europa.eu/ontology/cdm#expression_procurement_public"/>
<cdm:expression_belongs_to_work rdf:resource="&resource;ted/2021_S_004_003545"/>

<cdm:expression_title xml:lang="en"> eProcurement notice 2021_S_004_003545 </cdm:expression_title>

<cdm:expression_uses_language rdf:resource="&cellar-authority;language/MUL"/>
<cdm:expression_procurement_public_authority-type_name rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Other</cdm:expression_procurement_public_authority-type_name>
</cdm:expression>

<cdm:manifestation_distribution rdf:about="&resource;ted/2021_S_004_003545.MUL.rdf">

<cdm:manifestation_manifests_expression rdf:resource="&resource;ted/2021_S_004_003545.MUL"/>
<cdm:manifestation_type rdf:datatype="http://www.w3.org/2001/XMLSchema#string">rdf_epo</cdm:manifestation_type>
<cdm:manifestation_date_publication rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2021-01-07</cdm:manifestation_date_publication>
<cdm:manifestation_distribution_has_status_distribution_status rdf:resource="http://publications.europa.eu/resource/authority/distribution-status/COMPLETED"/>
<cdm:manifestation_distribution_has_media_type_concept_media_type rdf:resource="http://publications.europa.eu/resource/authority/file-type/RDF"/>
</cdm:manifestation_distribution>
</rdf:RDF>
39 changes: 39 additions & 0 deletions tests/unit/notice_metadata_processor/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@

import pytest

from ted_sws.core.model.manifestation import XMLManifestation
from ted_sws.core.model.notice import Notice
from ted_sws.data_sampler.services.notice_xml_indexer import index_notice
from tests import TEST_DATA_PATH


Expand All @@ -25,3 +28,39 @@ def notice_normalisation_test_data_path():
def eforms_xml_notice_paths() -> List[pathlib.Path]:
eforms_xml_notices_path = TEST_DATA_PATH / "eforms_samples"
return list(eforms_xml_notices_path.glob("**/*.xml"))


@pytest.fixture
def sample_ef_html_unsafe_notice_path() -> pathlib.Path:
    """Path to ``ef_html_unsafe_notice.xml`` in the notice_normalisation test data."""
    notice_normalisation_dir = TEST_DATA_PATH / "notice_normalisation"
    return notice_normalisation_dir / "ef_html_unsafe_notice.xml"


@pytest.fixture
def sample_indexed_ef_html_unsafe_notice(
        sample_ef_html_unsafe_notice_path: pathlib.Path) -> Notice:
    """Build a Notice from ``ef_html_unsafe_notice.xml`` and return it indexed.

    The notice takes the file name as its ``ted_id`` and the file content as
    its XML manifestation; the fixture value is whatever ``index_notice``
    returns for that notice.
    """
    # Decode explicitly instead of relying on the platform's locale encoding.
    # NOTE(review): assumes the sample notice is stored as UTF-8 XML - confirm.
    xml_content = sample_ef_html_unsafe_notice_path.read_text(encoding="utf-8")

    notice: Notice = Notice(ted_id=sample_ef_html_unsafe_notice_path.name)
    notice.set_xml_manifestation(XMLManifestation(object_data=xml_content))

    return index_notice(notice)


@pytest.fixture
def sample_sf_html_unsafe_notice_path() -> pathlib.Path:
    """Path to ``sf_html_unsafe_notice.xml`` in the notice_normalisation test data."""
    notice_normalisation_dir = TEST_DATA_PATH / "notice_normalisation"
    return notice_normalisation_dir / "sf_html_unsafe_notice.xml"


@pytest.fixture
def sample_indexed_sf_html_unsafe_notice(
        sample_sf_html_unsafe_notice_path: pathlib.Path) -> Notice:
    """Build a Notice from ``sf_html_unsafe_notice.xml`` and return it indexed.

    The notice takes the file name as its ``ted_id`` and the file content as
    its XML manifestation; the fixture value is whatever ``index_notice``
    returns for that notice.
    """
    # Decode explicitly instead of relying on the platform's locale encoding.
    # NOTE(review): assumes the sample notice is stored as UTF-8 XML - confirm.
    xml_content = sample_sf_html_unsafe_notice_path.read_text(encoding="utf-8")

    notice: Notice = Notice(ted_id=sample_sf_html_unsafe_notice_path.name)
    notice.set_xml_manifestation(XMLManifestation(object_data=xml_content))

    return index_notice(notice)


@pytest.fixture
def html_incompatible_str() -> str:
    """Return a sample string containing ``&``, ``<`` and ``>`` characters."""
    unsafe_text = "Construction work & planning <br />"
    return unsafe_text
47 changes: 44 additions & 3 deletions tests/unit/notice_metadata_processor/test_metadata_normaliser.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import pathlib
from xml.etree import ElementTree
from xml.etree.ElementTree import ParseError

import pytest

from ted_sws.core.model.manifestation import XMLManifestation
from ted_sws.core.model.metadata import NormalisedMetadata
from ted_sws.core.model.notice import NoticeStatus
from ted_sws.core.model.metadata import NormalisedMetadata, LanguageTaggedString
from ted_sws.core.model.notice import NoticeStatus, Notice
from ted_sws.notice_metadata_processor.adapters.notice_metadata_extractor import \
DefaultNoticeMetadataExtractor, EformsNoticeMetadataExtractor
from ted_sws.notice_metadata_processor.adapters.notice_metadata_normaliser import \
DefaultNoticeMetadataNormaliser, get_map_value, FORM_NUMBER_KEY, LEGAL_BASIS_KEY, SF_NOTICE_TYPE_KEY, \
DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser
DOCUMENT_CODE_KEY, EformsNoticeMetadataNormaliser, get_html_compatible_string
from ted_sws.notice_metadata_processor.model.metadata import ExtractedMetadata
from ted_sws.notice_metadata_processor.services.metadata_constraints import filter_df_by_variables
from ted_sws.notice_metadata_processor.services.metadata_normalizer import normalise_notice, normalise_notice_by_id, \
Expand All @@ -16,6 +20,8 @@
extract_and_normalise_notice_metadata
from ted_sws.resources.mapping_files_registry import MappingFilesRegistry

def html_str(content: str) -> str:
    """Wrap *content* in an XML declaration followed by a single ``<body>`` element."""
    template = '<?xml version="1.0" encoding="UTF-8"?> <body>{}</body>'
    return template.format(content)

def test_metadata_normaliser_by_notice(indexed_notice):
notice = normalise_notice(indexed_notice)
Expand Down Expand Up @@ -235,3 +241,38 @@ def test_normalising_notice_out_of_index(notice_normalisation_test_data_path):
with pytest.raises(Exception):
extract_and_normalise_notice_metadata(
xml_manifestation=XMLManifestation(object_data=broke_notice_content))


def test_normalising_notice_with_spaces_in_notice_id(sample_indexed_ef_html_unsafe_notice: Notice,
                                                     sample_indexed_sf_html_unsafe_notice: Notice
                                                     ):
    """The normalised notice publication number has no leading or trailing whitespace."""
    # Same check for both sample notices, in the original order (ef first, then sf)
    for source_notice in (sample_indexed_ef_html_unsafe_notice, sample_indexed_sf_html_unsafe_notice):
        normalised_notice: Notice = normalise_notice(source_notice)
        normalised_metadata = normalised_notice.normalised_metadata

        assert normalised_metadata.notice_publication_number.strip() == normalised_metadata.notice_publication_number


def test_get_html_compatible_string(html_incompatible_str: str):
    """The raw string fails to parse, while its escaped form parses once wrapped by html_str."""
    # The unescaped input must be rejected by the XML parser
    with pytest.raises(ParseError):
        ElementTree.fromstring(html_incompatible_str)

    tagged_input = LanguageTaggedString(text=html_incompatible_str)
    escaped: LanguageTaggedString = get_html_compatible_string(tagged_input)

    # Parsing raises unless the escaped text is well-formed inside the wrapper document
    wrapped_document = html_str(escaped.text)
    ElementTree.fromstring(wrapped_document)


def test_normalising_notice_with_html_incompatible_title(sample_indexed_ef_html_unsafe_notice: Notice,
                                                         sample_indexed_sf_html_unsafe_notice: Notice):
    """Every normalised title parses as well-formed markup once wrapped by html_str."""
    # Same check for both sample notices, in the original order (ef first, then sf)
    for source_notice in (sample_indexed_ef_html_unsafe_notice, sample_indexed_sf_html_unsafe_notice):
        normalised_notice: Notice = normalise_notice(source_notice)

        # fromstring raises ParseError on a title that is not well-formed, failing the test
        for title in normalised_notice.normalised_metadata.title:
            ElementTree.fromstring(html_str(title.text))
17 changes: 17 additions & 0 deletions tests/unit/notice_packager/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,21 @@ def template_sample_metadata_json() -> Dict:
return json.load((TEST_DATA_PATH / "notice_packager" / "template_metadata.json").open())


@pytest.fixture
def sample_metadata_with_wrong_title_json() -> Dict:
    """Load ``wrong_title/metadata_with_wrong_title.json`` from the notice_packager test data.

    :return: the parsed JSON document
    """
    json_path = TEST_DATA_PATH / "notice_packager" / "wrong_title" / "metadata_with_wrong_title.json"
    # Context manager closes the handle deterministically; the encoding is
    # explicit so parsing does not depend on the platform's locale.
    with json_path.open(encoding="utf-8") as json_file:
        return json.load(json_file)


@pytest.fixture
def sample_mets_xml_dmd_rdf_with_wrong_title_str() -> str:
    """Return the text of ``wrong_title/mets_with_wrong_title.mets.xml.dmd.rdf``.

    :return: the file content as a string
    """
    rdf_path = TEST_DATA_PATH / "notice_packager" / "wrong_title" / "mets_with_wrong_title.mets.xml.dmd.rdf"
    # Decode explicitly instead of relying on the platform's locale encoding.
    # NOTE(review): assumes the RDF sample is stored as UTF-8 - confirm.
    return rdf_path.read_text(encoding="utf-8")


@pytest.fixture
def sample_metadata_with_wrong_title(sample_metadata_with_wrong_title_json) -> PackagerMetadata:
    """Build a PackagerMetadata from the entries of the wrong-title metadata JSON fixture."""
    packager_metadata = PackagerMetadata(**sample_metadata_with_wrong_title_json)
    return packager_metadata


@pytest.fixture
def template_sample_metadata(template_sample_metadata_json) -> PackagerMetadata:
    """Build a PackagerMetadata from the entries of the template metadata JSON fixture."""
    packager_metadata = PackagerMetadata(**template_sample_metadata_json)
    return packager_metadata
Expand All @@ -54,6 +69,7 @@ def template_sample_expression(template_sample_metadata) -> ExpressionMetadata:
def template_sample_manifestation(template_sample_metadata) -> ManifestationMetadata:
return template_sample_metadata.manifestation


# template_metadata END


Expand All @@ -67,6 +83,7 @@ def notice_sample_metadata(notice_2018) -> NormalisedMetadata:

return normalised_metadata


# notice_metadata END


Expand Down

0 comments on commit adc2f3f

Please sign in to comment.