Skip to content

Commit

Permalink
add dataset identifier to nested docs
Browse files Browse the repository at this point in the history
  • Loading branch information
saggu committed Jan 21, 2018
1 parent 4bbabf1 commit 806a429
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 2 deletions.
4 changes: 4 additions & 0 deletions etk/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2267,6 +2267,8 @@ def create_kg_node_extractor(ds, config, doc, parent_doc_id, doc_id=None, url=No
raise KeyError('{} not found in the config for method: {}'.format(_SEGMENT_NAME, _CREATE_KG_NODE_EXTRACTOR))
segment_name = config[_SEGMENT_NAME]

dataset_id = config.get("dataset_identifier")

if doc:
if 'nested_docs' not in doc:
doc['nested_docs'] = list()
Expand Down Expand Up @@ -2310,6 +2312,8 @@ def create_kg_node_extractor(ds, config, doc, parent_doc_id, doc_id=None, url=No

result[_DOCUMENT_ID] = doc_id
result['doc_id'] = doc_id
if dataset_id:
result["dataset_identifier"] = dataset_id

if class_type:
result['@type'] = class_type
Expand Down
47 changes: 45 additions & 2 deletions etk/unit_tests/test_create_kg_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ def test_create_kg_node_text(self):
self.assertTrue(len(r['nested_docs']) == 1)
nested_doc = r['nested_docs'][0]
ce_expected = {
"actor_information": "agent 47"
}
"actor_information": "agent 47"
}

self.assertEqual(nested_doc['content_extraction'], ce_expected)
self.assertTrue('created_by' in nested_doc)
Expand Down Expand Up @@ -224,6 +224,49 @@ def test_doc_id(self):
self.assertEqual(r['knowledge_graph']['actors'][0]['value'], '47')
self.assertEqual(r['nested_docs'][0]['doc_id'], '47')

def test_create_kg_node_dataset_id(self):
    """Check that a configured ``dataset_identifier`` reaches the nested doc.

    The ``create_kg_node_extractor`` config below sets
    ``dataset_identifier`` to ``"agent47"``; the nested document produced
    for the ``actors.name`` segment must carry that exact value.
    """
    doc = {
        "url": "http:www.hitman.org",
        "doc_id": "19B0EAB211CD1D3C63063FAB0B2937043EA1F07B5341014A80E7473BA7318D9E",
        "actors": {
            "name": "agent 47",
            "affiliation": "International Contract Agency"
        }
    }

    e_config = {
        "document_id": "doc_id",
        "data_extraction": [
            {
                "input_path": [
                    "actors.name"
                ],
                "fields": {
                    "actors": {
                        "extractors": {
                            "create_kg_node_extractor": {
                                "config": {
                                    "segment_name": "actor_information",
                                    "dataset_identifier": "agent47"
                                }
                            }
                        }
                    }
                }
            }
        ]
    }
    c = Core(extraction_config=e_config)
    r = c.process(doc)

    # NOTE(review): these checks inspect the input ``doc`` rather than the
    # returned ``r`` (as in the original test); presumably ``process``
    # updates the document in place -- confirm.
    self.assertIn('knowledge_graph', doc)
    self.assertIn('actors', doc['knowledge_graph'])
    self.assertEqual(len(doc['knowledge_graph']['actors']), 1)

    self.assertIn('nested_docs', r)
    self.assertEqual(len(r['nested_docs']), 1)
    nested_doc = r['nested_docs'][0]
    self.assertIn('dataset_identifier', nested_doc)
    # assertEqual, not assertTrue: assertTrue(value, "agent47") would treat
    # "agent47" as the failure message and pass for any truthy value.
    self.assertEqual(nested_doc["dataset_identifier"], "agent47")


# Run this module's unit tests when the file is executed directly as a script.
if __name__ == '__main__':
unittest.main()

0 comments on commit 806a429

Please sign in to comment.