From 90ef0319e5af4a00346d582c1686baea4ec30eb0 Mon Sep 17 00:00:00 2001
From: jgaff
Date: Tue, 30 May 2017 10:57:00 -0500
Subject: [PATCH] Release v0.1.0

Move harvesters, converters, and ingester out of the prototype phase.
Update examples to 0.1.0.

NOTE: Validator currently accepts missing mdf_version, but it will be required soon.
---
 .gitignore | 11 +-
 prototypes/base_style.py => base_style.py | 0
 .../ab_initio_solute_database_converter.py | 0
 .../amcs_converter.py | 0
 .../autovasp_converter.py | 0
 .../cip_converter.py | 0
 .../converter_template.py | 2 +
 .../core_mof_converter.py | 0
 .../cxidb_converter.py | 0
 .../doak_strain_energies_converter.py | 0
 .../fe_cr_al_oxidation_converter.py | 0
 .../gw100_converter.py | 0
 .../gw_soc81_converter.py | 0
 .../hopv_converter.py | 0
 .../jcap_benchmarking_db_converter.py | 0
 .../jcap_xps_spectral_db_converter.py | 0
 .../khazana_polymer_converter.py | 0
 .../khazana_vasp_converter.py | 0
 .../materials_commons_converter.py | 0
 .../matin_converter.py | 0
 .../nanomine_converter.py | 0
 .../nist_atom_weight_iso_comp_converter.py | 0
 .../nist_heat_transmission_converter.py | 0
 .../nist_ip_converter.py | 0
 .../nist_janaf_converter.py | 0
 .../nist_mml_converter.py | 0
 .../nist_th_ar_lamp_spectrum_converter.py | 0
 .../nist_xps_db_converter.py | 0
 .../nist_xray_tran_en_db_converter.py | 0
 .../nrel_pv_converter.py | 0
 .../oqmd_converter.py | 0
 ...xygen_interstital_deformation_converter.py | 0
 .../parsers/ase_parser.py | 0
 .../parsers/pymatgen_parser.py | 0
 .../parsers/tab_parser.py | 0
 .../parsers/utils.py | 0
 .../converters => converters}/paths.py | 0
 .../pppdb_converter.py | 0
 .../qm_mdt_c_converter.py | 0
 .../sluschi_converter.py | 0
 .../strain_effects_oxygen_converter.py | 0
 .../surface_diffusion_converter.py | 0
 .../ti_o_fitting_db_converter.py | 0
 .../ti_o_meam_model_converter.py | 0
 .../trinkle_elastic_fe_bcc_converter.py | 0
 .../converters => converters}/validator.py | 5 +
 .../xafs_sl_converter.py | 0
 example_converter_0.1.0/example_converter.py | 140 +++
 .../example_parser.py | 0
 .../example_paths.py | 0
 example_converter_0.1.0/validator_copy.py | 548 ++++++++++++++++++
 .../amcs_harvester.py | 0
 .../cxidb_harvester.py | 0
 .../farrel_lytle_harvester.py | 0
 .../jcap_xps_spectral_db_harvester.py | 0
 .../materials_commons_harvester.py | 0
 .../mdf_dspace_harvester.py | 0
 .../nist_mml_harvester.py | 0
 .../nist_xps_db_harvester.py | 0
 .../oai_pmh_harvester.py | 0
 .../harvesters => harvesters}/paths.py | 0
 .../ingester => ingester}/data_ingester.py | 0
 .../ingester => ingester}/globus_auth.py | 0
 .../ingester => ingester}/globus_client.py | 0
 .../ingester => ingester}/gmeta_utils.py | 0
 {prototypes/ingester => ingester}/paths.py | 0
 .../example_converter/example_converter.py | 105 ----
 .../example_converter/validator_copy.py | 487 ----------------
 prototypes/sandbox.ipynb => sandbox.ipynb | 0
 .../search-demo.ipynb => search-demo.ipynb | 0
 70 files changed, 700 insertions(+), 598 deletions(-)
 rename prototypes/base_style.py => base_style.py (100%)
 rename {prototypes/converters => converters}/ab_initio_solute_database_converter.py (100%)
 rename {prototypes/converters => converters}/amcs_converter.py (100%)
 rename {prototypes/converters => converters}/autovasp_converter.py (100%)
 rename {prototypes/converters => converters}/cip_converter.py (100%)
 rename {prototypes/converters => converters}/converter_template.py (98%)
 rename {prototypes/converters => converters}/core_mof_converter.py (100%)
 rename {prototypes/converters => converters}/cxidb_converter.py (100%)
rename {prototypes/converters => converters}/doak_strain_energies_converter.py (100%) rename {prototypes/converters => converters}/fe_cr_al_oxidation_converter.py (100%) rename {prototypes/converters => converters}/gw100_converter.py (100%) rename {prototypes/converters => converters}/gw_soc81_converter.py (100%) rename {prototypes/converters => converters}/hopv_converter.py (100%) rename {prototypes/converters => converters}/jcap_benchmarking_db_converter.py (100%) rename {prototypes/converters => converters}/jcap_xps_spectral_db_converter.py (100%) rename {prototypes/converters => converters}/khazana_polymer_converter.py (100%) rename {prototypes/converters => converters}/khazana_vasp_converter.py (100%) rename {prototypes/converters => converters}/materials_commons_converter.py (100%) rename {prototypes/converters => converters}/matin_converter.py (100%) rename {prototypes/converters => converters}/nanomine_converter.py (100%) rename {prototypes/converters => converters}/nist_atom_weight_iso_comp_converter.py (100%) rename {prototypes/converters => converters}/nist_heat_transmission_converter.py (100%) rename {prototypes/converters => converters}/nist_ip_converter.py (100%) rename {prototypes/converters => converters}/nist_janaf_converter.py (100%) rename {prototypes/converters => converters}/nist_mml_converter.py (100%) rename {prototypes/converters => converters}/nist_th_ar_lamp_spectrum_converter.py (100%) rename {prototypes/converters => converters}/nist_xps_db_converter.py (100%) rename {prototypes/converters => converters}/nist_xray_tran_en_db_converter.py (100%) rename {prototypes/converters => converters}/nrel_pv_converter.py (100%) rename {prototypes/converters => converters}/oqmd_converter.py (100%) rename {prototypes/converters => converters}/oxygen_interstital_deformation_converter.py (100%) rename {prototypes/converters => converters}/parsers/ase_parser.py (100%) rename {prototypes/converters => converters}/parsers/pymatgen_parser.py (100%) rename {prototypes/converters => converters}/parsers/tab_parser.py (100%) rename {prototypes/converters => converters}/parsers/utils.py (100%) rename {prototypes/converters => converters}/paths.py (100%) rename {prototypes/converters => converters}/pppdb_converter.py (100%) rename {prototypes/converters => converters}/qm_mdt_c_converter.py (100%) rename {prototypes/converters => converters}/sluschi_converter.py (100%) rename {prototypes/converters => converters}/strain_effects_oxygen_converter.py (100%) rename {prototypes/converters => converters}/surface_diffusion_converter.py (100%) rename {prototypes/converters => converters}/ti_o_fitting_db_converter.py (100%) rename {prototypes/converters => converters}/ti_o_meam_model_converter.py (100%) rename {prototypes/converters => converters}/trinkle_elastic_fe_bcc_converter.py (100%) rename {prototypes/converters => converters}/validator.py (99%) rename {prototypes/converters => converters}/xafs_sl_converter.py (100%) create mode 100644 example_converter_0.1.0/example_converter.py rename {prototypes/example_converter => example_converter_0.1.0}/example_parser.py (100%) rename {prototypes/example_converter => example_converter_0.1.0}/example_paths.py (100%) create mode 100644 example_converter_0.1.0/validator_copy.py rename {prototypes/harvesters => harvesters}/amcs_harvester.py (100%) rename {prototypes/harvesters => harvesters}/cxidb_harvester.py (100%) rename {prototypes/harvesters => harvesters}/farrel_lytle_harvester.py (100%) rename {prototypes/harvesters => 
harvesters}/jcap_xps_spectral_db_harvester.py (100%) rename {prototypes/harvesters => harvesters}/materials_commons_harvester.py (100%) rename {prototypes/harvesters => harvesters}/mdf_dspace_harvester.py (100%) rename {prototypes/harvesters => harvesters}/nist_mml_harvester.py (100%) rename {prototypes/harvesters => harvesters}/nist_xps_db_harvester.py (100%) rename {prototypes/harvesters => harvesters}/oai_pmh_harvester.py (100%) rename {prototypes/harvesters => harvesters}/paths.py (100%) rename {prototypes/ingester => ingester}/data_ingester.py (100%) rename {prototypes/ingester => ingester}/globus_auth.py (100%) rename {prototypes/ingester => ingester}/globus_client.py (100%) rename {prototypes/ingester => ingester}/gmeta_utils.py (100%) rename {prototypes/ingester => ingester}/paths.py (100%) delete mode 100644 prototypes/example_converter/example_converter.py delete mode 100644 prototypes/example_converter/validator_copy.py rename prototypes/sandbox.ipynb => sandbox.ipynb (100%) rename prototypes/search-demo.ipynb => search-demo.ipynb (100%) diff --git a/.gitignore b/.gitignore index abf36e6..ff49f9c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,14 +1,13 @@ *.pyc *.swp +*.swo *_json.pickle *.json *.ipynb_checkpoints Untitled*.ipynb -prototypes/feedstock/* -prototypes/feedstock -prototypes/datasets/* -prototypes/datasets - +feedstock/* +feedstock +datasets/* +datasets -*.swo diff --git a/prototypes/base_style.py b/base_style.py similarity index 100% rename from prototypes/base_style.py rename to base_style.py diff --git a/prototypes/converters/ab_initio_solute_database_converter.py b/converters/ab_initio_solute_database_converter.py similarity index 100% rename from prototypes/converters/ab_initio_solute_database_converter.py rename to converters/ab_initio_solute_database_converter.py diff --git a/prototypes/converters/amcs_converter.py b/converters/amcs_converter.py similarity index 100% rename from prototypes/converters/amcs_converter.py rename to converters/amcs_converter.py diff --git a/prototypes/converters/autovasp_converter.py b/converters/autovasp_converter.py similarity index 100% rename from prototypes/converters/autovasp_converter.py rename to converters/autovasp_converter.py diff --git a/prototypes/converters/cip_converter.py b/converters/cip_converter.py similarity index 100% rename from prototypes/converters/cip_converter.py rename to converters/cip_converter.py diff --git a/prototypes/converters/converter_template.py b/converters/converter_template.py similarity index 98% rename from prototypes/converters/converter_template.py rename to converters/converter_template.py index 187916e..b891906 100644 --- a/prototypes/converters/converter_template.py +++ b/converters/converter_template.py @@ -2,6 +2,7 @@ import sys from validator import Validator +# VERSION 0.1.0 # This is the template for new converters. It is not a complete converter. Incomplete parts are labelled with "TODO" # Arguments: @@ -29,6 +30,7 @@ def convert(input_path, metadata=None, verbose=False): # "cite_as": , # REQ list of strings: Complete citation(s) for this dataset. # "license": , # RCM string: License to use the dataset (preferrably a link to the actual license). +# "mdf_version": , # REQ string: The metadata version in use (see VERSION above). 
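To make the new field concrete, a filled-in version of this header block might look like the sketch below. It is illustrative only: the keys come from the template above, but every value is a placeholder rather than a real dataset entry.

dataset_metadata = {
    "globus_subject": "https://example.org/datasets/demo",       # placeholder URI
    "acl": ["public"],
    "mdf_source_name": "demo_dataset",
    "cite_as": ["Doe, J. (2017). Demo Dataset."],                 # placeholder citation
    "license": "https://creativecommons.org/licenses/by/4.0/",
    "mdf_version": "0.1.0",  # currently accepted if missing, but will be required soon (see commit message)
    "dc.title": "Demo Dataset",
    "dc.creator": "Demo Lab",
    "dc.identifier": "https://example.org/datasets/demo"
}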
# "dc.title": , # REQ string: Title of dataset # "dc.creator": , # REQ string: Owner of dataset diff --git a/prototypes/converters/core_mof_converter.py b/converters/core_mof_converter.py similarity index 100% rename from prototypes/converters/core_mof_converter.py rename to converters/core_mof_converter.py diff --git a/prototypes/converters/cxidb_converter.py b/converters/cxidb_converter.py similarity index 100% rename from prototypes/converters/cxidb_converter.py rename to converters/cxidb_converter.py diff --git a/prototypes/converters/doak_strain_energies_converter.py b/converters/doak_strain_energies_converter.py similarity index 100% rename from prototypes/converters/doak_strain_energies_converter.py rename to converters/doak_strain_energies_converter.py diff --git a/prototypes/converters/fe_cr_al_oxidation_converter.py b/converters/fe_cr_al_oxidation_converter.py similarity index 100% rename from prototypes/converters/fe_cr_al_oxidation_converter.py rename to converters/fe_cr_al_oxidation_converter.py diff --git a/prototypes/converters/gw100_converter.py b/converters/gw100_converter.py similarity index 100% rename from prototypes/converters/gw100_converter.py rename to converters/gw100_converter.py diff --git a/prototypes/converters/gw_soc81_converter.py b/converters/gw_soc81_converter.py similarity index 100% rename from prototypes/converters/gw_soc81_converter.py rename to converters/gw_soc81_converter.py diff --git a/prototypes/converters/hopv_converter.py b/converters/hopv_converter.py similarity index 100% rename from prototypes/converters/hopv_converter.py rename to converters/hopv_converter.py diff --git a/prototypes/converters/jcap_benchmarking_db_converter.py b/converters/jcap_benchmarking_db_converter.py similarity index 100% rename from prototypes/converters/jcap_benchmarking_db_converter.py rename to converters/jcap_benchmarking_db_converter.py diff --git a/prototypes/converters/jcap_xps_spectral_db_converter.py b/converters/jcap_xps_spectral_db_converter.py similarity index 100% rename from prototypes/converters/jcap_xps_spectral_db_converter.py rename to converters/jcap_xps_spectral_db_converter.py diff --git a/prototypes/converters/khazana_polymer_converter.py b/converters/khazana_polymer_converter.py similarity index 100% rename from prototypes/converters/khazana_polymer_converter.py rename to converters/khazana_polymer_converter.py diff --git a/prototypes/converters/khazana_vasp_converter.py b/converters/khazana_vasp_converter.py similarity index 100% rename from prototypes/converters/khazana_vasp_converter.py rename to converters/khazana_vasp_converter.py diff --git a/prototypes/converters/materials_commons_converter.py b/converters/materials_commons_converter.py similarity index 100% rename from prototypes/converters/materials_commons_converter.py rename to converters/materials_commons_converter.py diff --git a/prototypes/converters/matin_converter.py b/converters/matin_converter.py similarity index 100% rename from prototypes/converters/matin_converter.py rename to converters/matin_converter.py diff --git a/prototypes/converters/nanomine_converter.py b/converters/nanomine_converter.py similarity index 100% rename from prototypes/converters/nanomine_converter.py rename to converters/nanomine_converter.py diff --git a/prototypes/converters/nist_atom_weight_iso_comp_converter.py b/converters/nist_atom_weight_iso_comp_converter.py similarity index 100% rename from prototypes/converters/nist_atom_weight_iso_comp_converter.py rename to 
converters/nist_atom_weight_iso_comp_converter.py diff --git a/prototypes/converters/nist_heat_transmission_converter.py b/converters/nist_heat_transmission_converter.py similarity index 100% rename from prototypes/converters/nist_heat_transmission_converter.py rename to converters/nist_heat_transmission_converter.py diff --git a/prototypes/converters/nist_ip_converter.py b/converters/nist_ip_converter.py similarity index 100% rename from prototypes/converters/nist_ip_converter.py rename to converters/nist_ip_converter.py diff --git a/prototypes/converters/nist_janaf_converter.py b/converters/nist_janaf_converter.py similarity index 100% rename from prototypes/converters/nist_janaf_converter.py rename to converters/nist_janaf_converter.py diff --git a/prototypes/converters/nist_mml_converter.py b/converters/nist_mml_converter.py similarity index 100% rename from prototypes/converters/nist_mml_converter.py rename to converters/nist_mml_converter.py diff --git a/prototypes/converters/nist_th_ar_lamp_spectrum_converter.py b/converters/nist_th_ar_lamp_spectrum_converter.py similarity index 100% rename from prototypes/converters/nist_th_ar_lamp_spectrum_converter.py rename to converters/nist_th_ar_lamp_spectrum_converter.py diff --git a/prototypes/converters/nist_xps_db_converter.py b/converters/nist_xps_db_converter.py similarity index 100% rename from prototypes/converters/nist_xps_db_converter.py rename to converters/nist_xps_db_converter.py diff --git a/prototypes/converters/nist_xray_tran_en_db_converter.py b/converters/nist_xray_tran_en_db_converter.py similarity index 100% rename from prototypes/converters/nist_xray_tran_en_db_converter.py rename to converters/nist_xray_tran_en_db_converter.py diff --git a/prototypes/converters/nrel_pv_converter.py b/converters/nrel_pv_converter.py similarity index 100% rename from prototypes/converters/nrel_pv_converter.py rename to converters/nrel_pv_converter.py diff --git a/prototypes/converters/oqmd_converter.py b/converters/oqmd_converter.py similarity index 100% rename from prototypes/converters/oqmd_converter.py rename to converters/oqmd_converter.py diff --git a/prototypes/converters/oxygen_interstital_deformation_converter.py b/converters/oxygen_interstital_deformation_converter.py similarity index 100% rename from prototypes/converters/oxygen_interstital_deformation_converter.py rename to converters/oxygen_interstital_deformation_converter.py diff --git a/prototypes/converters/parsers/ase_parser.py b/converters/parsers/ase_parser.py similarity index 100% rename from prototypes/converters/parsers/ase_parser.py rename to converters/parsers/ase_parser.py diff --git a/prototypes/converters/parsers/pymatgen_parser.py b/converters/parsers/pymatgen_parser.py similarity index 100% rename from prototypes/converters/parsers/pymatgen_parser.py rename to converters/parsers/pymatgen_parser.py diff --git a/prototypes/converters/parsers/tab_parser.py b/converters/parsers/tab_parser.py similarity index 100% rename from prototypes/converters/parsers/tab_parser.py rename to converters/parsers/tab_parser.py diff --git a/prototypes/converters/parsers/utils.py b/converters/parsers/utils.py similarity index 100% rename from prototypes/converters/parsers/utils.py rename to converters/parsers/utils.py diff --git a/prototypes/converters/paths.py b/converters/paths.py similarity index 100% rename from prototypes/converters/paths.py rename to converters/paths.py diff --git a/prototypes/converters/pppdb_converter.py b/converters/pppdb_converter.py similarity index 100% 
rename from prototypes/converters/pppdb_converter.py rename to converters/pppdb_converter.py diff --git a/prototypes/converters/qm_mdt_c_converter.py b/converters/qm_mdt_c_converter.py similarity index 100% rename from prototypes/converters/qm_mdt_c_converter.py rename to converters/qm_mdt_c_converter.py diff --git a/prototypes/converters/sluschi_converter.py b/converters/sluschi_converter.py similarity index 100% rename from prototypes/converters/sluschi_converter.py rename to converters/sluschi_converter.py diff --git a/prototypes/converters/strain_effects_oxygen_converter.py b/converters/strain_effects_oxygen_converter.py similarity index 100% rename from prototypes/converters/strain_effects_oxygen_converter.py rename to converters/strain_effects_oxygen_converter.py diff --git a/prototypes/converters/surface_diffusion_converter.py b/converters/surface_diffusion_converter.py similarity index 100% rename from prototypes/converters/surface_diffusion_converter.py rename to converters/surface_diffusion_converter.py diff --git a/prototypes/converters/ti_o_fitting_db_converter.py b/converters/ti_o_fitting_db_converter.py similarity index 100% rename from prototypes/converters/ti_o_fitting_db_converter.py rename to converters/ti_o_fitting_db_converter.py diff --git a/prototypes/converters/ti_o_meam_model_converter.py b/converters/ti_o_meam_model_converter.py similarity index 100% rename from prototypes/converters/ti_o_meam_model_converter.py rename to converters/ti_o_meam_model_converter.py diff --git a/prototypes/converters/trinkle_elastic_fe_bcc_converter.py b/converters/trinkle_elastic_fe_bcc_converter.py similarity index 100% rename from prototypes/converters/trinkle_elastic_fe_bcc_converter.py rename to converters/trinkle_elastic_fe_bcc_converter.py diff --git a/prototypes/converters/validator.py b/converters/validator.py similarity index 99% rename from prototypes/converters/validator.py rename to converters/validator.py index 6af7cf9..8b6a85e 100644 --- a/prototypes/converters/validator.py +++ b/converters/validator.py @@ -280,6 +280,11 @@ def validate_metadata(metadata, entry_type, strict=False): "req": False, "type": str }, + "mdf_version": { +# "req": True, + "req": False, + "type": str + }, "dc.title": { "req": True, "type": str diff --git a/prototypes/converters/xafs_sl_converter.py b/converters/xafs_sl_converter.py similarity index 100% rename from prototypes/converters/xafs_sl_converter.py rename to converters/xafs_sl_converter.py diff --git a/example_converter_0.1.0/example_converter.py b/example_converter_0.1.0/example_converter.py new file mode 100644 index 0000000..aa0816c --- /dev/null +++ b/example_converter_0.1.0/example_converter.py @@ -0,0 +1,140 @@ +import json +import sys +from validator_copy import Validator +import example_parser + +# VERSION 0.1.0 + +# This is an example of a converter, using sample data. +# Arguments: +# input_path (string): The file or directory where the data resides. This should not be hard-coded in the function, for portability. +# metadata (string or dict): The path to the JSON dataset metadata file, a dict containing the dataset metadata, or None to specify the metadata here. Default None. +# verbose (bool): Should the script print status messages to standard output? Default False. +# NOTE: The converter should have NO output if verbose is False, unless there is an error. 
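Given that contract, a small driver script could invoke the converter in each of the three accepted forms of the metadata argument. This is a hedged sketch: the paths and file names are hypothetical, chosen only to illustrate the call shapes.

convert("/data/example_dataset", verbose=True)                      # metadata defined inside convert()
convert("/data/example_dataset", metadata="dataset_metadata.json")  # metadata loaded from a JSON file
convert("/data/example_dataset", metadata=dataset_metadata)         # metadata passed as an already-built dict (e.g. the earlier sketch)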
+def convert(input_path, metadata=None, verbose=False):
+    if verbose:
+        print("Begin example conversion")
+
+    # Collect the metadata
+    # Fields can be:
+    #    REQ (Required, must be present)
+    #    RCM (Recommended, should be present if possible)
+    #    OPT (Optional, can be present if useful)
+    if not metadata:
+        dataset_metadata = {
+            "globus_subject": "https://materialsdatafacility.org/",  # REQ string: Unique value (should be URI if possible)
+            "acl": ["public"],  # REQ list of strings: UUID(s) of users/groups allowed to access data, or ["public"]
+            "mdf_source_name": "example_dataset",  # REQ string: Unique name for dataset
+            "mdf-publish.publication.collection": "Examples",  # RCM string: Collection the dataset belongs to
+            "mdf_data_class": "text",  # RCM string: Type of data in all records in the dataset (do not provide for multi-type datasets)
+
+            "cite_as": ["Chesterson, A.B. (01-01-1970). On the Origin of Examples."],  # REQ list of strings: Complete citation(s) for this dataset.
+            "license": "https://creativecommons.org/licenses/by-sa/4.0/",  # RCM string: License to use the dataset (preferably a link to the actual license).
+            "mdf_version": "0.1.0",  # REQ string: The metadata version in use (see VERSION above).
+
+            "dc.title": "MDF Example Dataset",  # REQ string: Title of dataset
+            "dc.creator": "MDF",  # REQ string: Owner of dataset
+            "dc.identifier": "http://dx.doi.org/10.12345",  # REQ string: Link to dataset (dataset DOI if available)
+            "dc.contributor.author": ["A.B. Chesterson", "Ben Blaiszik", "Jonathon Gaff"],  # RCM list of strings: Author(s) of dataset
+            "dc.subject": ["examples", "testing", "sandbox"],  # RCM list of strings: Keywords about dataset
+            "dc.description": "This is an example dataset for an example converter",  # RCM string: Description of dataset contents
+            "dc.relatedidentifier": ["https://www.globus.org", "https://www.google.com"],  # RCM list of strings: Link(s) to related materials (such as an article)
+            "dc.year": 2017  # RCM integer: Year of dataset creation
+        }
+    elif type(metadata) is str:
+        try:
+            with open(metadata, 'r') as metadata_file:
+                dataset_metadata = json.load(metadata_file)
+        except Exception as e:
+            sys.exit("Error: Unable to read metadata: " + repr(e))
+    elif type(metadata) is dict:
+        dataset_metadata = metadata
+    else:
+        sys.exit("Error: Invalid metadata parameter")
+
+
+
+    # Make a Validator to help write the feedstock
+    # You must pass the metadata to the constructor
+    # Each Validator instance can only be used for a single dataset
+    dataset_validator = Validator(dataset_metadata, strict=False)
+    # You can also force the Validator to treat warnings as errors with strict=True
+    #dataset_validator = Validator(dataset_metadata, strict=True)
+
+
+    # Get the data
+    # Each record should be exactly one dictionary
+    # It is recommended that you convert your records one at a time, but it is possible to put them all into one big list (see below)
+    # It is also recommended that you use a parser to help with this process if one is available for your datatype
+    raw_data = read_data()
+
+    # Each record also needs its own metadata
+    for raw_record in raw_data:
+        # Using a parser when possible is recommended
+        record = example_parser.parse_example_single(raw_record)
+
+        # Fields can be:
+        #    REQ (Required, must be present)
+        #    RCM (Recommended, should be present if possible)
+        #    OPT (Optional, can be present if useful)
+        record_metadata = {
+            "globus_subject": "https://materialsdatafacility.org/example/" + record["id"],  # REQ string: Unique value (should be URI to record if possible)
+            "acl": ["public"],  # REQ list of strings: UUID(s) of users/groups allowed to access data, or ["public"]
+#            "mdf-publish.publication.collection": ,  # OPT string: Collection the record belongs to (if different from dataset)
+#            "mdf_data_class": ,  # OPT string: Type of data in record (if not set in dataset metadata)
+            "mdf-base.material_composition": record["chemical_composition"],  # RCM string: Chemical composition of material in record
+
+#            "cite_as": ,  # OPT list of strings: Complete citation(s) for this record (if different from dataset)
+#            "license": ,  # OPT string: License to use the record (if different from dataset) (preferably a link to the actual license).
+
+            "dc.title": "MDF Example - " + record["chemical_composition"],  # REQ string: Title of record
+#            "dc.creator": ,  # OPT string: Owner of record (if different from dataset)
+            "dc.identifier": "https://materialsdatafacility.org/example/" + record["id"],  # RCM string: Link to record (record webpage, if available)
+#            "dc.contributor.author": ,  # OPT list of strings: Author(s) of record (if different from dataset)
+            "dc.subject": ["single record example"],  # OPT list of strings: Keywords about record
+            "dc.description": "This is an example record",  # OPT string: Description of record
+#            "dc.relatedidentifier": ,  # OPT list of strings: Link(s) to related materials (if different from dataset)
+#            "dc.year": ,  # OPT integer: Year of record creation (if different from dataset)
+
+            "data": {  # RCM dictionary: Other record data (described below)
+                "raw": json.dumps(raw_record),  # RCM string: Original data record text, if feasible
+                "files": {"text": "https://materialsdatafacility.org/robots.txt"},  # RCM dictionary: {file_type : uri_to_file} pairs, data files (Example: {"cif" : "https://example.org/cifs/data_file.cif"})
+
+                # other  # RCM any JSON-valid type: Any other data fields you would like to include go in the "data" dictionary. Keys will be prepended with 'mdf_source_name:'
+                "useful_data": [record["useful_data_1"], record["useful_data_2"]],
+                "other_useful_data": record["useful_data_3"]
+            }
+        }
+
+        # Pass each individual record to the Validator
+        result = dataset_validator.write_record(record_metadata)
+
+        # Check if the Validator accepted the record, and print a message if it didn't
+        # If the Validator returns "success" == True, the record was written successfully
+        if result["success"] is not True:
+            print("Error:", result["message"], ":", result.get("invalid_metadata", ""))
+        # The Validator may return warnings if strict=False, which should be noted
+        if result.get("warnings", None):
+            print("Warnings:", result["warnings"])
+
+    # Alternatively, if the only way you can process your data is in one large list, you can pass the list to the Validator
+    # You still must add the required metadata to your records
+    # It is recommended to use the previous method if possible
+    # result = dataset_validator.write_dataset(your_records_with_metadata)
+    #if result["success"] is not True:
+        #print("Error:", result["message"])
+
+    # You're done!
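The error-handling convention shown above generalizes: every Validator call returns a plain dict keyed on "success", so a converter can branch on the result uniformly. A minimal sketch, using only keys that appear in validator_copy.py below:

result = dataset_validator.write_record(record_metadata)
if result["success"]:
    for warning in result.get("warnings", []):  # non-fatal issues, reported when strict=False
        print("Warning:", warning)
else:
    # "invalid_metadata" lists each rejected field with its value and the reason it failed
    print("Error:", result["message"], ":", result.get("invalid_metadata", ""))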
+ if verbose: + print("Finished converting") + + +def read_data(): + """Dummy function as an example""" + return range(10) + + +# Optionally, you can have a default call here for testing +# The convert function may not be called in this way, so code here is primarily for testing +if __name__ == "__main__": + convert(input_path="", verbose=True) diff --git a/prototypes/example_converter/example_parser.py b/example_converter_0.1.0/example_parser.py similarity index 100% rename from prototypes/example_converter/example_parser.py rename to example_converter_0.1.0/example_parser.py diff --git a/prototypes/example_converter/example_paths.py b/example_converter_0.1.0/example_paths.py similarity index 100% rename from prototypes/example_converter/example_paths.py rename to example_converter_0.1.0/example_paths.py diff --git a/example_converter_0.1.0/validator_copy.py b/example_converter_0.1.0/validator_copy.py new file mode 100644 index 0000000..91bf507 --- /dev/null +++ b/example_converter_0.1.0/validator_copy.py @@ -0,0 +1,548 @@ +import json +from bson import ObjectId +from copy import deepcopy +import os +import re +from urllib.parse import quote +import example_paths as paths + +DICT_OF_ALL_ELEMENTS = {"Actinium": "Ac", "Silver": "Ag", "Aluminum": "Al", "Americium": "Am", "Argon": "Ar", "Arsenic": "As", "Astatine": "At", "Gold": "Au", "Boron": "B", "Barium": "Ba", "Beryllium": "Be", "Bohrium": "Bh", "Bismuth": "Bi", "Berkelium": "Bk", "Bromine": "Br", "Carbon": "C", "Calcium": "Ca", "Cadmium": "Cd", "Cerium": "Ce", "Californium": "Cf", "Chlorine": "Cl", "Curium": "Cm", "Copernicium": "Cn", "Cobalt": "Co", "Chromium": "Cr", "Cesium": "Cs", "Copper": "Cu", "Dubnium": "Db", "Darmstadtium": "Ds", "Dysprosium": "Dy", "Erbium": "Er", "Einsteinium": "Es", "Europium": "Eu", "Fluorine": "F", "Iron": "Fe", "Flerovium": "Fl", "Fermium": "Fm", "Francium": "Fr", "Gallium": "Ga", "Gadolinium": "Gd", "Germanium": "Ge", "Hydrogen": "H", "Helium": "He", "Hafnium": "Hf", "Mercury": "Hg", "Holmium": "Ho", "Hassium": "Hs", "Iodine": "I", "Indium": "In", "Iridium": "Ir", "Potassium": "K", "Krypton": "Kr", "Lanthanum": "La", "Lithium": "Li", "Lawrencium": "Lr", "Lutetium": "Lu", "Livermorium": "Lv", "Mendelevium": "Md", "Magnesium": "Mg", "Manganese": "Mn", "Molybdenum": "Mo", "Meitnerium": "Mt", "Nitrogen": "N", "Sodium": "Na", "Niobium": "Nb", "Neodymium": "Nd", "Neon": "Ne", "Nickel": "Ni", "Nobelium": "No", "Neptunium": "Np", "Oxygen": "O", "Osmium": "Os", "Phosphorus": "P", "Protactinium": "Pa", "Lead": "Pb", "Palladium": "Pd", "Promethium": "Pm", "Polonium": "Po", "Praseodymium": "Pr", "Platinum": "Pt", "Plutonium": "Pu", "Radium": "Ra", "Rubidium": "Rb", "Rhenium": "Re", "Rutherfordium": "Rf", "Roentgenium": "Rg", "Rhodium": "Rh", "Radon": "Rn", "Ruthenium": "Ru", "Sulfur": "S", "Antimony": "Sb", "Scandium": "Sc", "Selenium": "Se", "Seaborgium": "Sg", "Silicon": "Si", "Samarium": "Sm", "Tin": "Sn", "Strontium": "Sr", "Tantalum": "Ta", "Terbium": "Tb", "Technetium": "Tc", "Tellurium": "Te", "Thorium": "Th", "Titanium": "Ti", "Thallium": "Tl", "Thulium": "Tm", "Uranium": "U", "Ununoctium": "Uuo", "Ununpentium": "Uup", "Ununseptium": "Uus", "Ununtrium": "Uut", "Vanadium": "V", "Tungsten": "W", "Xenon": "Xe", "Yttrium": "Y", "Ytterbium": "Yb", "Zinc": "Zn", "Zirconium": "Zr"} + +MAX_KEYS = 20 +MAX_LIST = 5 + +QUOTE_SAFE = ":/?=#" + +#Validator class holds data about a dataset while writing to feedstock +class Validator: + #init takes dataset metadata to start processing and save another function call + def 
__init__(self, metadata, strict=False): + self.__feedstock = None + self.__dataset_id = None + self.__mdf_source_name = None + self.__uris = [] + + self.__strict = strict + + res = self.__write_metadata(metadata) + if not res["success"]: + raise ValueError("Invalid metadata: '" + res["message"] + "' " + str(res.get("invalid_metadata", ""))) + + #del attempts cleanup + def __del__(self): + try: + self.__feedstock.close() + except AttributeError: #Feedstock wasn't opened + pass + + #Sets metadata for dataset + def __write_metadata(self, metadata): + if self.__feedstock or self.__dataset_id or self.__mdf_source_name: #Metadata already set; cannot change + return { + "success": False, + "message": "Metadata already written for this dataset" + } + + md_val = validate_metadata(metadata, "dataset", strict=self.__strict) + if not md_val["success"]: + return { + "success": False, + "message": md_val["message"], + "invalid_metadata": md_val.get("invalid_metadata", ""), + "warnings": md_val.get("warnings", []) + } + + # Log mdf_source_name + metadata["mdf_source_name"] = metadata["mdf_source_name"].lower().replace(" ", "_") + self.__mdf_source_name = metadata["mdf_source_name"] + + # Log citation + self.__cite_as = metadata["cite_as"] + + # Log collection + self.__collection = metadata.get("mdf-publish.publication.collection", None) + + # Log default acls + self.__acl = metadata["acl"] + + # Log default license + self.__license = metadata.get("license", None) + + # Log data class + self.__data_class = metadata.get("mdf_data_class", None) + + #Open feedstock file for the first time and write metadata entry + feedstock_path = paths.feedstock + metadata["mdf_source_name"] + "_all.json" + metadata["mdf_id"] = str(ObjectId()) + metadata["mdf_node_type"] = "dataset" + try: + self.__feedstock = open(feedstock_path, 'w') + json.dump(metadata, self.__feedstock) + self.__feedstock.write("\n") + + self.__dataset_id = metadata["mdf_id"] + + return { + "success": True, + "warnings": md_val.get("warnings", []) + } + + except: + return { + "success": False, + "message": "Error: Bad metadata" + } + + #Output single record to feedstock + def write_record(self, record): + if (not self.__feedstock) or (not self.__dataset_id) or (not self.__mdf_source_name): #Metadata not set + return { + "success": False, + "message": "Metadata not written for this dataset" + } + rec_val = validate_metadata(record, "record", strict=self.__strict) + if not rec_val["success"]: + return { + "success": False, + "message": rec_val["message"], + "invalid_metadata": rec_val.get("invalid_metadata", ""), + "warnings": rec_val.get("warnings", []) + } + + # Check for duplicate URIs + if record["globus_subject"] in self.__uris: + return { + "success": False, + "message": "'globus_subject' duplicate found:" + record["globus_subject"], + "warnings": rec_val.get("warnings", []) + } + else: + self.__uris.append(record["globus_subject"]) + + # Copy/set non-user-settable metadata and dataset defaults + record["mdf_id"] = str(ObjectId()) + record["parent_id"] = self.__dataset_id + record["mdf_node_type"] = "record" + record["mdf_source_name"] = self.__mdf_source_name + + record["globus_subject"] = quote(record["globus_subject"], safe=QUOTE_SAFE) + + if record.get("dc.identifier", None): + record["dc.identifier"] = quote(record["dc.identifier"], safe=QUOTE_SAFE) + + if not record.get("cite_as", None): + record["cite_as"] = self.__cite_as + + if not record.get("acl", None): + record["acl"] = self.__acl + + if not 
record.get("mdf-publish.publication.collection", None) and self.__collection: + record["mdf-publish.publication.collection"] = self.__collection + + if not record.get("license", None) and self.__license: + record["license"] = self.__license + + if not record.get("mdf_data_class", None) and self.__data_class: + record["mdf_data_class"] = self.__data_class + elif self.__data_class and record.get("mdf_data_class", None) != self.__data_class: + return { + "success": False, + "message": "mdf_data_class mismatch: '" + record.get("mdf_data_class", "None") + "' does not match dataset value of '" + str(self.__data_class) + "'", + "warnings": rec_val.get("warnings", []) + } + + if record.get("mdf-base.material_composition", None): + composition = record["mdf-base.material_composition"].replace(" and ", "") + for element in DICT_OF_ALL_ELEMENTS.keys(): + composition = re.sub("(?i)"+element, DICT_OF_ALL_ELEMENTS[element], composition) + str_of_elem = "" + for char in list(composition): + if char.isupper(): #Uppercase indicates start of new element symbol + str_of_elem += " " + char + elif char.islower(): #Lowercase indicates continuation of symbol + str_of_elem += char + #Anything else is not useful (numbers, whitespace, etc.) + + list_of_elem = list(set(str_of_elem.split())) #split elements in string (on whitespace), make unique, make JSON-serializable + # If any "element" isn't in the periodic table, the entire composition is likely not a chemical formula and should not be parsed + if all([elem in DICT_OF_ALL_ELEMENTS.values() for elem in list_of_elem]): + record["mdf-base.elements"] = list_of_elem + + + new_data = {} + namespace_exempt_keys = [ + "raw", + "files" + ] + for key in namespace_exempt_keys: + if key in record.get("data", {}).keys(): + if key == "files": + new_files = {} + for fkey, fvalue in record["data"].pop("files").items(): + new_files[fkey] = quote(fvalue, safe=QUOTE_SAFE) + new_data["files"] = new_files + else: + new_data[key] = record["data"].pop(key) + for key, value in record.get("data", {}).items(): + new_data[self.__mdf_source_name + ":" + key] = value + if new_data: + record["data"] = new_data + + #Write new record to feedstock + try: + json.dump(record, self.__feedstock) + self.__feedstock.write("\n") + return { + "success" : True, + "warnings": rec_val.get("warnings", []) + } + except: + return { + "success": False, + "message": "Error: Bad record" + } + + #Output whole dataset to feedstock + #all_records must be a list of all the dataset records + def write_dataset(self, all_records): + if (not self.__feedstock) or (not self.__dataset_id) or (not self.__mdf_source_name): #Metadata not set + return { + "success": False, + "message": "Metadata not written for this dataset" + } + #Write all records to feedstock + for record in all_records: + result = self.write_record(record) + if not result["success"]: + print("Error on record: ", record) + elif result["warnings"]: + print("Warning:", result["warnings"]) + return {"success" : True} + + @property + def dataset_id(self): + return self.__dataset_id + + +# Function to validate metadata fields +# Args: +# metadata: dict, metadata to validate +# entry_type: string, type of metadata (dataset, record, etc.) +# strict: bool, warnings are errors? Default False. 
+def validate_metadata(metadata, entry_type, strict=False): + try: + json.loads(json.dumps(metadata)) + if type(metadata) is not dict: + raise TypeError + except TypeError: + return { + "success": False, + "message": "Metadata must be a JSON-serializable dict" + } + # valid_meta format: + # field_name: { + # "req": bool, is field required? + # "type": type, datatype + # "contains": type, for lists, type of data inside (None for any type) + # } + invalid_list = [] + warning_list = [] + if entry_type == "dataset": + # Valid dataset metadata + valid_meta = { + "globus_subject": { + "req": True, + "type": str + }, + "acl": { + "req": True, + "type": list, + "contains": str + }, + "mdf_source_name": { + "req": True, + "type": str + }, + "mdf-publish.publication.collection": { + "req": False, + "type": str + }, + "mdf_data_class": { + "req": False, + "type": str + }, + "cite_as": { + "req": True, + "type": list, + "contains": str + }, + "license": { + "req": False, + "type": str + }, + "mdf_version": { + "req": True, + "type": str + }, + "dc.title": { + "req": True, + "type": str + }, + "dc.creator": { + "req": True, + "type": str + }, + "dc.identifier": { + "req": True, + "type": str + }, + "dc.contributor.author": { + "req": False, + "type": list, + "contains": str + }, + "dc.subject": { + "req": False, + "type": list, + "contains": str + }, + "dc.description": { + "req": False, + "type": str + }, + "dc.relatedidentifier": { + "req": False, + "type": list, + "contains": str + }, + "dc.year": { + "req": False, + "type": int + } + } + # Not implemented: nist_mrr, mdf_credits + elif entry_type == "record": + # Valid record metadata + valid_meta = { + #Temp for scrolling + "scroll_id": { + "req": False, + "type": int + }, + "globus_subject": { + "req": True, + "type": str + }, + "acl": { + "req": True, + "type": list, + "contains": str + }, + "mdf-publish.publication.collection": { + "req": False, + "type": str + }, + "mdf_data_class": { + "req": False, + "type": str + }, + "mdf-base.material_composition": { + "req": False, + "type": str + }, + "cite_as": { + "req": False, + "type": list, + "contains": str + }, + "license": { + "req": False, + "type": str + }, + "dc.title": { + "req": True, + "type": str + }, + "dc.creator": { + "req": False, + "type": str + }, + "dc.identifier": { + "req": False, + "type": str + }, + "dc.contributor.author": { + "req": False, + "type": list, + "contains": str + }, + "dc.subject": { + "req": False, + "type": list, + "contains": str + }, + "dc.description": { + "req": False, + "type": str + }, + "dc.relatedidentifier": { + "req": False, + "type": list, + "contains": str + }, + "dc.year": { + "req": False, + "type": int + }, + "data": { + "req": False, + "type": dict, + "contains": None + } + } + # Additional check for data block + data_valid = validate_metadata(metadata.get("data", {}), "user_data", strict=strict) + if not data_valid["success"]: + invalid_list += data_valid["invalid_metadata"] + warning_list += data_valid["warnings"] + + elif entry_type == "user_data": + # Validate the data dict of a record's metadata + valid_meta = { + "raw": { + "req": False, + "type": str + }, + "files": { + "req": False, + "type": dict, + "contains": str + } + } + # Additional validations for data dict + res = validate_user_data(metadata) + metadata = res["value"] if res["value"] else {} + warning_list += res["warnings"] + + + else: + return { + "success": False, + "message": entry_type + " is not a valid entry type." 
+        }
+
+    # Check metadata
+    for field, reqs in valid_meta.items():
+        # If the field type is not correct or field is required but is missing, the metadata is invalid.
+        # If the field is not required, the type will be instantiated and will subsequently pass the check.
+        if type(metadata.get(field, None if reqs["req"] else reqs["type"]())) is not reqs["type"] or not metadata.get(field, not reqs["req"]):
+            invalid_list.append({
+                "field" : field,
+                "value" : metadata.get(field, None),
+                "reason" : field + (" is required and" if reqs["req"] else "") + " must be a non-empty " + reqs["type"].__name__
+                })
+        # If the field is a list and the contents should all be a given datatype, check the list.
+        elif reqs["type"] is list and reqs.get("contains"):
+            # Makes a list of bools. Each bool is True only if the element is not empty and is the correct type.
+            # If not all bools are True, the metadata is invalid.
+            if not all( [(type(elem) is reqs["contains"] and elem) for elem in metadata.get(field, None if reqs["req"] else reqs["type"]())] ):
+                invalid_list.append({
+                    "field" : field,
+                    "value" : metadata.get(field, None),
+                    "reason" : field + " must contain only non-empty " + reqs["contains"].__name__
+                    })
+        # Same as list check, but with a dictionary.
+        elif reqs["type"] is dict and reqs.get("contains"):
+            if not all( [(type(elem) is reqs["contains"] and elem) for elem in metadata.get(field, None if reqs["req"] else reqs["type"]()).values()] ):
+                invalid_list.append({
+                    "field": field,
+                    "value" : metadata.get(field, None),
+                    "reason" : field + " must contain only non-empty " + reqs["contains"].__name__
+                    })
+
+    # No other metadata is allowed
+    disallowed_list = [x for x in metadata.keys() if x not in valid_meta.keys() and entry_type != "user_data"]
+    for key in disallowed_list:
+        invalid_list.append({
+            "field" : key,
+            "value" : metadata.get(key, None),
+            "reason" : key + " is not a valid metadata field"
+            })
+
+    if strict:
+        invalid_list += warning_list
+        warning_list.clear()
+
+    if not invalid_list:
+        return {
+            "success": True,
+            "warnings": warning_list
+            }
+    else:
+        return {
+            "success": False,
+            "invalid_metadata": invalid_list,
+            "message": "Invalid " + entry_type + " metadata",
+            "warnings": warning_list
+            }
+
+
+def validate_user_data(data, total_keys=0):
+    warnings = []
+    if type(data) is list:
+        if len(data) > MAX_LIST:
+            return {
+                "value": None,
+                "warnings": ["List of length " + str(len(data)) + " exceeds maximum length of " + str(MAX_LIST)]
+                }
+        elif any([type(elem) is list for elem in data]):
+            return {
+                "value": None,
+                "warnings": ["Lists containing lists are not allowed"]
+                }
+        else:
+            new_list = []
+            for elem in data:
+                res = validate_user_data(elem, total_keys)
+                if res["warnings"]:
+                    warnings += res["warnings"]
+                if res["value"]:
+                    new_list.append(res["value"])
+                    total_keys = res["total_keys"]
+            return {
+                "value": new_list,
+                "warnings": warnings,
+                "total_keys": total_keys
+                }
+    elif type(data) is dict:
+        total_keys += len(data.keys())
+        if total_keys > MAX_KEYS:
+            return {
+                "value": None,
+                "warnings": ["Data exceeds the total number of allowed keys (" + str(MAX_KEYS) + ")"]
+                }
+        else:
+            new_dict = {}
+            for key, value in data.items():
+                res = validate_user_data(value, total_keys)
+                if res["warnings"]:
+                    warnings += res["warnings"]
+                if res["value"]:
+                    new_dict[key] = res["value"]
+                    total_keys = res["total_keys"]
+            return {
+                "value": new_dict,
+                "warnings": warnings,
+                "total_keys": total_keys
+                }
+    else:
+        return {
+            "value": data,
+            "warnings": warnings,
"total_keys": total_keys + } + + +if __name__ == "__main__": + print("\nThis is the Validator. You can use the Validator to write valid, converted data into feedstock.") + print("There are in-depth instructions on this process in 'converter_template.py'.") diff --git a/prototypes/harvesters/amcs_harvester.py b/harvesters/amcs_harvester.py similarity index 100% rename from prototypes/harvesters/amcs_harvester.py rename to harvesters/amcs_harvester.py diff --git a/prototypes/harvesters/cxidb_harvester.py b/harvesters/cxidb_harvester.py similarity index 100% rename from prototypes/harvesters/cxidb_harvester.py rename to harvesters/cxidb_harvester.py diff --git a/prototypes/harvesters/farrel_lytle_harvester.py b/harvesters/farrel_lytle_harvester.py similarity index 100% rename from prototypes/harvesters/farrel_lytle_harvester.py rename to harvesters/farrel_lytle_harvester.py diff --git a/prototypes/harvesters/jcap_xps_spectral_db_harvester.py b/harvesters/jcap_xps_spectral_db_harvester.py similarity index 100% rename from prototypes/harvesters/jcap_xps_spectral_db_harvester.py rename to harvesters/jcap_xps_spectral_db_harvester.py diff --git a/prototypes/harvesters/materials_commons_harvester.py b/harvesters/materials_commons_harvester.py similarity index 100% rename from prototypes/harvesters/materials_commons_harvester.py rename to harvesters/materials_commons_harvester.py diff --git a/prototypes/harvesters/mdf_dspace_harvester.py b/harvesters/mdf_dspace_harvester.py similarity index 100% rename from prototypes/harvesters/mdf_dspace_harvester.py rename to harvesters/mdf_dspace_harvester.py diff --git a/prototypes/harvesters/nist_mml_harvester.py b/harvesters/nist_mml_harvester.py similarity index 100% rename from prototypes/harvesters/nist_mml_harvester.py rename to harvesters/nist_mml_harvester.py diff --git a/prototypes/harvesters/nist_xps_db_harvester.py b/harvesters/nist_xps_db_harvester.py similarity index 100% rename from prototypes/harvesters/nist_xps_db_harvester.py rename to harvesters/nist_xps_db_harvester.py diff --git a/prototypes/harvesters/oai_pmh_harvester.py b/harvesters/oai_pmh_harvester.py similarity index 100% rename from prototypes/harvesters/oai_pmh_harvester.py rename to harvesters/oai_pmh_harvester.py diff --git a/prototypes/harvesters/paths.py b/harvesters/paths.py similarity index 100% rename from prototypes/harvesters/paths.py rename to harvesters/paths.py diff --git a/prototypes/ingester/data_ingester.py b/ingester/data_ingester.py similarity index 100% rename from prototypes/ingester/data_ingester.py rename to ingester/data_ingester.py diff --git a/prototypes/ingester/globus_auth.py b/ingester/globus_auth.py similarity index 100% rename from prototypes/ingester/globus_auth.py rename to ingester/globus_auth.py diff --git a/prototypes/ingester/globus_client.py b/ingester/globus_client.py similarity index 100% rename from prototypes/ingester/globus_client.py rename to ingester/globus_client.py diff --git a/prototypes/ingester/gmeta_utils.py b/ingester/gmeta_utils.py similarity index 100% rename from prototypes/ingester/gmeta_utils.py rename to ingester/gmeta_utils.py diff --git a/prototypes/ingester/paths.py b/ingester/paths.py similarity index 100% rename from prototypes/ingester/paths.py rename to ingester/paths.py diff --git a/prototypes/example_converter/example_converter.py b/prototypes/example_converter/example_converter.py deleted file mode 100644 index ac0aa6b..0000000 --- a/prototypes/example_converter/example_converter.py +++ /dev/null @@ -1,105 +0,0 @@ 
-from validator_copy import Validator -import example_parser - - -# Arguments: -# input_path (string): The file or directory where the data resides. This should not be hard-coded in the function, for portability. -# verbose (bool): Should the script print status messages to standard output? Default False. -def convert(input_path, verbose=False): - - # Collect the metadata - # Fields can be: - # REQ (Required, must be present) - # RCM (Recommended, should be present if possible) - # OPT (Optional, can be present if useful) - dataset_metadata = { - "globus_subject": "https://materialsdatafacility.org/", # REQ string: Unique value (should be URI if possible) - "acl": ["public"], # REQ list of strings: UUID(s) of users/groups allowed to access data, or ["public"] - "mdf_source_name": "example_dataset", # REQ string: Unique name for dataset - "mdf-publish.publication.collection": "examples", # RCM string: Collection the dataset belongs to - - "dc.title": "MDF Example Dataset", # REQ string: Title of dataset - "dc.creator": "MDF", # REQ string: Creator of dataset - "dc.identifier": "http://dx.doi.org/10.12345", # REQ string: Link to dataset (dataset DOI if available) - "dc.contributor.author": ["Jonathon Gaff", "Ben Blaiszik"], # RCM list of strings: Author(s) of dataset - "dc.subject": ["example", "test", "converter"], # RCM list of strings: Keywords about dataset - "dc.description": "This is an example dataset for an example converter", # RCM string: Description of dataset contents - "dc.relatedidentifier": ["https://www.globus.org"], # RCM list of strings: Link(s) to related materials (such as an article) - "dc.year": 2017 # RCM integer: Year of dataset creation - } - - # Make a Validator to help write the feedstock - # You must pass the metadata to the constructor - # Each Validator instance can only be used for a single dataset - dataset_validator = Validator(dataset_metadata) - - # Get the data - # Each record should be exactly one dictionary - # It is recommended that you convert your records one at a time, but it is possible to put them all into one big list (see below) - # It is also recommended that you use a parser to help with this process if one is available for your datatype - raw_data = read_data() - - # Each record also needs its own metadata - for raw_record in raw_data: - # Using a parser when possible is recommended - record = example_parser.parse_example_single(raw_record) - - # Fields can be: - # REQ (Required, must be present) - # RCM (Recommended, should be present if possible) - # OPT (Optional, can be present if useful) - record_metadata = { - "globus_subject": "https://materialsdatafacility.org/example/" + record["id"], # REQ string: Unique value (should be URI to record if possible) - "acl": ["public"], # REQ list of strings: UUID(s) of users/groups allowed to access data, or ["public"] - "mdf-publish.publication.collection": "examples", # RCM string: Collection the record belongs to - "mdf_data_class": "text", # RCM string: Type of data in record - "mdf-base.material_composition": record["chemical_composition"], # RCM string: Chemical composition of material in record - - "dc.title": "MDF Example - " + record["chemical_composition"], # REQ string: Title of record - #"dc.creator": , # OPT string: Owner of record (if different from dataset) - "dc.identifier": "https://materialsdatafacility.org/example/" + record["id"], # RCM string: Link to record (record webpage, if available) - #"dc.contributor.author": , # OPT list of strings: Author(s) of record (if different from 
dataset) - #"dc.subject": , # OPT list of strings: Keywords about record - "dc.description": "This is an example record", # OPT string: Description of record - #"dc.relatedidentifier": , # OPT list of strings: Link(s) to related materials (if different from dataset) - #"dc.year": , # OPT integer: Year of record creation (if different from dataset) - - "data": { # REQ dictionary: Other record data (described below) - "raw": str(raw_record), # RCM string: Original data record text, if feasible - "files": {"text": "https://materialsdatafacility.org/robots.txt"}, # REQ dictionary: {file_type : uri_to_file} pairs, may be empty (Example: {"cif" : "https://example.org/cifs/data_file.cif"}) - - # other # RCM any JSON-valid type: Any other data fields you would like to include go in the "data" dictionary. Keys will be prepended with mdf_source_name: - "useful_data": [record["useful_data_1"], record["useful_data_2"]], - "other_useful_data": record["useful_data_3"] - } - } - # Pass each individual record to the Validator - result = dataset_validator.write_record(record_metadata) - - # Check if the Validator accepted the record, and print a message if it didn't - # If the Validator returns "success" == True, the record was written successfully - if result["success"] is not True: - print("Error:", result["message"], ":", result.get("invalid_data", "")) - - # Alternatively, if the only way you can process your data is in one large list, you can pass the list to the Validator - # You still must add the required metadata to your records - # It is recommended to use the previous method if possible - # result = dataset_validator.write_dataset(your_records_with_metadata) - #if result["success"] is not True: - #print("Error:", result["message"]) - - # TODO: Save your converter as [dataset_name]_converter.py - # You're done! 
- if verbose: - print("Finished converting") - - -def read_data(): - """Dummy function as an example""" - return range(10) - - -# Optionally, you can have a default call here for testing -# The convert function may not be called in this way, so code here is primarily for testing -if __name__ == "__main__": - convert(input_path="") diff --git a/prototypes/example_converter/validator_copy.py b/prototypes/example_converter/validator_copy.py deleted file mode 100644 index 2ffa0f9..0000000 --- a/prototypes/example_converter/validator_copy.py +++ /dev/null @@ -1,487 +0,0 @@ -from json import dump -from bson import ObjectId -import os -import example_paths as paths - -#Validator class holds data about a dataset while writing to feedstock -class Validator: - #init takes dataset metadata to start processing and save another function call - def __init__(self, metadata): - self.__feedstock = None - self.dataset_id = None - self.__mdf_source_name = None - - res = self.__write_metadata(metadata) - if not res["success"]: - raise ValueError("Invalid metadata: '" + res["message"]) - - #del attempts cleanup - def __del__(self): - try: - self.__feedstock.close() - except AttributeError: #Feedstock wasn't opened - pass - - #Sets metadata for dataset - def __write_metadata(self, metadata): - if self.__feedstock or self.dataset_id or self.__mdf_source_name: #Metadata already set; cannot change - return {"success" : False, "message" : "Metadata already written for this dataset"} - - md_val = validate_metadata(metadata) - if not md_val["success"]: - return {"success" : False, "message" : md_val["message"]} - - metadata["mdf_source_name"] = metadata["mdf_source_name"].lower().replace(" ", "_") - self.__mdf_source_name = metadata["mdf_source_name"] - - #Open feedstock file for the first time and write metadata entry - feedstock_path = paths.feedstock + metadata["mdf_source_name"] + "_all.json" - metadata["mdf_id"] = str(ObjectId()) - metadata["mdf_node_type"] = "dataset" - try: - self.__feedstock = open(feedstock_path, 'w') - dump(metadata, self.__feedstock) - self.__feedstock.write("\n") - - self.dataset_id = metadata["mdf_id"] - - return {"success" : True} - - except: - return {"success" : False, "message" : "Error: Bad metadata"} - - #Output single record to feedstock - def write_record(self, record): - if (not self.__feedstock) or (not self.dataset_id) or (not self.__mdf_source_name): #Metadata not set - return {"success" : False, "message" : "Metadata not written for this dataset"} - rec_val = validate_record(record) - if not rec_val["success"]: - return {"success" : False, "message" : rec_val["message"]} - - record["mdf_id"] = str(ObjectId()) - record["parent_id"] = self.dataset_id - record["mdf_node_type"] = "record" - record["mdf_source_name"] = self.__mdf_source_name - - if record.get("mdf-base.material_composition", None): - str_of_elem = "" - for char in list(record["mdf-base.material_composition"]): - if char.isupper(): #Uppercase indicates start of new element symbol - str_of_elem += " " + char - elif char.islower(): #Lowercase indicates continuation of symbol - str_of_elem += char - #Anything else is not useful (numbers, whitespace, etc.) 
- record["mdf-base.elements"] = list(set(str_of_elem.split())) #split elements in string (on whitespace), make unique, make JSON-serializable - - - new_data = { - "files" : record["data"].pop("files") - } - if "raw" in record["data"].keys(): - new_data["raw"] = record["data"].pop("raw") - for key, value in record["data"].items(): - new_data[self.__mdf_source_name + ":" + key] = value - record["data"] = new_data - - #Write new record to feedstock - try: - dump(record, self.__feedstock) - self.__feedstock.write("\n") - return {"success" : True} - except: - return {"success" : False, "message" : "Error: Bad record"} - - #Output whole dataset to feedstock - #all_records must be a list of all the dataset records - def write_dataset(self, all_records): - if (not self.__feedstock) or (not self.dataset_id): #Metadata not set - return {"success" : False, "message" : "Metadata not written for this dataset"} - #Write all records to feedstock - for record in all_records: - result = self.write_record(record) - if not result["success"]: - print("Error on record: ", record) - return {"success" : True} - - -#Function to validate metadata fields -def validate_metadata(metadata): - valid_list = [ - "globus_subject", - "acl", - "mdf_source_name", - "mdf-publish.publication.collection", - "dc.title", - "dc.creator", - "dc.identifier", - "dc.contributor.author", - "dc.subject", - "dc.description", - "dc.relatedidentifier", - "dc.year" - ] - invalid_list = [] - - #globus_subject must exist, and be a non-empty string - if type(metadata.get("globus_subject", None)) is not str or not metadata.get("globus_subject"): - invalid_list.append({ - "field" : "globus_subject", - "value" : metadata.get("globus_subject", None), - "reason" : "globus_subject is required and must be a string" - }) - - #acl must exist, and be a list - if type(metadata.get("acl", None)) is not list: - invalid_list.append({ - "field" : "acl", - "value" : metadata.get("acl", None), - "reason" : "acl is required and must be a list" - }) - #acl must contain only non-empty strings - elif not all( [ (type(elem) is str and elem) for elem in metadata.get("acl") ] ): - invalid_list.append({ - "field" : "acl", - "value" : metadata.get("acl", None), - "reason" : "acl must contain only non-empty strings" - }) - - #mdf_source_name must exist, and be a non-empty string - if type(metadata.get("mdf_source_name", None)) is not str or not metadata.get("mdf_source_name"): - invalid_list.append({ - "field" : "mdf_source_name", - "value" : metadata.get("mdf_source_name", None), - "reason" : "mdf_source_name is required and must be a string" - }) - - #mdf-publish.publication.collection, if it exists, must be a non-empty string - if type(metadata.get("mdf-publish.publication.collection", "")) is not str or not metadata.get("mdf-publish.publication.collection", True): - invalid_list.append({ - "field" : "mdf-publish.publication.collection", - "value" : metadata.get("mdf-publish.publication.collection", None), - "reason" : "mdf-publish.publication.collection must be a non-empty string" - }) - - #dc.title must exist, and be a non-empty string - if type(metadata.get("dc.title", None)) is not str or not metadata.get("dc.title"): - invalid_list.append({ - "field" : "dc.title", - "value" : metadata.get("dc.title", None), - "reason" : "dc.title is required and must be a string" - }) - - #dc.creator must exist, and be a non-empty string - if type(metadata.get("dc.creator", None)) is not str or not metadata.get("dc.creator"): - invalid_list.append({ - "field" : "dc.creator", - 
"value" : metadata.get("dc.creator", None), - "reason" : "dc.creator is required and must be a string" - }) - - #dc.identifier must exist, and be a non-empty string - if type(metadata.get("dc.identifier", None)) is not str or not metadata.get("dc.identifier"): - invalid_list.append({ - "field" : "dc.identifier", - "value" : metadata.get("dc.identifier", None), - "reason" : "dc.identifier is required and must be a string" - }) - - #dc.contributor.author, if it exists, must be a list - if type(metadata.get("dc.contributor.author", [])) is not list: - invalid_list.append({ - "field" : "dc.contributor.author", - "value" : metadata.get("dc.contributor.author", None), - "reason" : "dc.contributor.author must be a list" - }) - #dc.contributor.author, if it exists, must contain only non-empty strings - elif not all( [ (type(elem) is str and elem) for elem in metadata.get("dc.contributor.author", []) ] ): - invalid_list.append({ - "field" : "dc.contributor.author", - "value" : metadata.get("dc.contributor.author", None), - "reason" : "dc.contributor.author must contain only non-empty strings" - }) - - #dc.subject, if it exists, must be a list - if type(metadata.get("dc.subject", [])) is not list: - invalid_list.append({ - "field" : "dc.subject", - "value" : metadata.get("dc.subject", None), - "reason" : "dc.subject must be a list" - }) - #dc.subject, if it exists, must contain only non-empty strings - elif not all( [ (type(elem) is str and elem) for elem in metadata.get("dc.subject", []) ] ): - invalid_list.append({ - "field" : "dc.subject", - "value" : metadata.get("dc.subject", None), - "reason" : "dc.subject must contain only non-empty strings" - }) - - #dc.description, if it exists, must be a non-empty string - if type(metadata.get("dc.description", "")) is not str or not metadata.get("dc.identifier", True): - invalid_list.append({ - "field" : "dc.description", - "value" : metadata.get("dc.description", None), - "reason" : "dc.description must be a non-empty string" - }) - - #dc.relatedidentifier, if it exists, must be a list - if type(metadata.get("dc.relatedidentifier", [])) is not list: - invalid_list.append({ - "field" : "dc.relatedidentifier", - "value" : metadata.get("dc.relatedidentifier", None), - "reason" : "dc.relatedidentifier must be a list" - }) - #dc.relatedidentifier, if it exists, must contain only non-empty strings - elif not all( [ (type(elem) is str and elem) for elem in metadata.get("dc.relatedidentifier", []) ] ): - invalid_list.append({ - "field" : "dc.relatedidentifier", - "value" : metadata.get("dc.relatedidentifier", None), - "reason" : "dc.relatedidentifier must contain only non-empty strings" - }) - - #dc.year, if it exists, must be an int - if type(metadata.get("dc.year", 0)) is not int: - invalid_list.append({ - "field" : "dc.year", - "value" : metadata.get("dc.year", None), - "reason" : "dc.year must be an integer" - }) - - #NIST_MRR fields - #Not implemented - - #mdf_credits - #Not implemented - - #No other metadata is allowed - disallowed_list = [x for x in metadata.keys() if x not in valid_list] - for key in disallowed_list: - invalid_list.append({ - "field" : key, - "value" : metadata.get(key, None), - "reason" : key + " is not a valid metadata field" - }) - - if not invalid_list: - return { - "success" : True - } - else: - return { - "success" : False, - "invalid_metadata" : invalid_list, - "message" : "Invalid dataset metadata" - } - - -#Function to validate record fields -def validate_record(metadata): - valid_list = [ - "globus_subject", - "acl", - 
#"mdf_source_name", - "mdf-publish.publication.collection", - "mdf_data_class", - "mdf-base.material_composition", - "dc.title", - "dc.creator", - "dc.identifier", - "dc.contributor.author", - "dc.subject", - "dc.description", - "dc.relatedidentifier", - "dc.year", - "data" - ] - invalid_list = [] - - #globus_subject must exist, and be a non-empty string - if type(metadata.get("globus_subject", None)) is not str or not metadata.get("globus_subject"): - invalid_list.append({ - "field" : "globus_subject", - "value" : metadata.get("globus_subject", None), - "reason" : "globus_subject is required and must be a string" - }) - - #acl must exist, and be a list - if type(metadata.get("acl", None)) is not list: - invalid_list.append({ - "field" : "acl", - "value" : metadata.get("acl", None), - "reason" : "acl is required and must be a list" - }) - #acl must contain only non-empty strings - elif not all( [ (type(elem) is str and elem) for elem in metadata.get("acl") ] ): - invalid_list.append({ - "field" : "acl", - "value" : metadata.get("acl", None), - "reason" : "acl must contain only non-empty strings" - }) - -# Requirement removed -# #mdf_source_name must exist, and be a non-empty string -# if type(metadata.get("mdf_source_name", None)) is not str or not metadata.get("mdf_source_name"): -# invalid_list.append({ -# "field" : "mdf_source_name", -# "value" : metadata.get("mdf_source_name", None), -# "reason" : "mdf_source_name is required and must be a string" -# }) - - #mdf-publish.publication.collection, if it exists, must be a non-empty string - if type(metadata.get("mdf-publish.publication.collection", "")) is not str or not metadata.get("mdf-publish.publication.collection", True): - invalid_list.append({ - "field" : "mdf-publish.publication.collection", - "value" : metadata.get("mdf-publish.publication.collection", None), - "reason" : "mdf-publish.publication.collection must be a non-empty string" - }) - - #mdf_data_class, if it exists, must be a non-empty string - if type(metadata.get("mdf_data_class", "")) is not str or not metadata.get("mdf_data_class", True): - invalid_list.append({ - "field" : "mdf_data_class", - "value" : metadata.get("mdf_data_class", None), - "reason" : "mdf_data_class must be a non-empty string" - }) - - #mdf-base.material_composition, if it exists, must be a non-empty string - if type(metadata.get("mdf-base.material_composition", "")) is not str or not metadata.get("mdf-base.material_composition", True): - invalid_list.append({ - "field" : "mdf-base.material_composition", - "value" : metadata.get("mdf-base.material_composition", None), - "reason" : "mdf-base.material_composition must be a non-empty string" - }) - - #dc.title must exist, and be a non-empty string - if type(metadata.get("dc.title", None)) is not str or not metadata.get("dc.title"): - invalid_list.append({ - "field" : "dc.title", - "value" : metadata.get("dc.title", None), - "reason" : "dc.title is required and must be a string" - }) - - #dc.creator, if it exists, must be a non-empty string - if type(metadata.get("dc.creator", "")) is not str or not metadata.get("dc.creator", True): - invalid_list.append({ - "field" : "dc.creator", - "value" : metadata.get("dc.creator", None), - "reason" : "dc.creator must be a string" - }) - - #dc.identifier, if it exists, must be a non-empty string - if type(metadata.get("dc.identifier", "")) is not str or not metadata.get("dc.identifier", True): - invalid_list.append({ - "field" : "dc.identifier", - "value" : metadata.get("dc.identifier", None), - "reason" : 
"dc.identifier must be a string" - }) - - #dc.contributor.author, if it exists, must be a list - if type(metadata.get("dc.contributor.author", [])) is not list: - invalid_list.append({ - "field" : "dc.contributor.author", - "value" : metadata.get("dc.contributor.author", None), - "reason" : "dc.contributor.author must be a list" - }) - #dc.contributor.author, if it exists, must contain only non-empty strings - elif not all( [ (type(elem) is str and elem) for elem in metadata.get("dc.contributor.author", []) ] ): - invalid_list.append({ - "field" : "dc.contributor.author", - "value" : metadata.get("dc.contributor.author", None), - "reason" : "dc.contributor.author must contain only non-empty strings" - }) - - #dc.subject, if it exists, must be a list - if type(metadata.get("dc.subject", [])) is not list: - invalid_list.append({ - "field" : "dc.subject", - "value" : metadata.get("dc.subject", None), - "reason" : "dc.subject must be a list" - }) - #dc.subject, if it exists, must contain only non-empty strings - elif not all( [ (type(elem) is str and elem) for elem in metadata.get("dc.subject", []) ] ): - invalid_list.append({ - "field" : "dc.subject", - "value" : metadata.get("dc.subject", None), - "reason" : "dc.subject must contain only non-empty strings" - }) - - #dc.description, if it exists, must be a non-empty string - if type(metadata.get("dc.description", "")) is not str or not metadata.get("dc.identifier", True): - invalid_list.append({ - "field" : "dc.description", - "value" : metadata.get("dc.description", None), - "reason" : "dc.description must be a non-empty string" - }) - - #dc.relatedidentifier, if it exists, must be a list - if type(metadata.get("dc.relatedidentifier", [])) is not list: - invalid_list.append({ - "field" : "dc.relatedidentifier", - "value" : metadata.get("dc.relatedidentifier", None), - "reason" : "dc.relatedidentifier must be a list" - }) - #dc.relatedidentifier, if it exists, must contain only non-empty strings - elif not all( [ (type(elem) is str and elem) for elem in metadata.get("dc.relatedidentifier", []) ] ): - invalid_list.append({ - "field" : "dc.relatedidentifier", - "value" : metadata.get("dc.relatedidentifier", None), - "reason" : "dc.relatedidentifier must contain only non-empty strings" - }) - - #dc.year, if it exists, must be an int - if type(metadata.get("dc.year", 0)) is not int: - invalid_list.append({ - "field" : "dc.year", - "value" : metadata.get("dc.year", None), - "reason" : "dc.year must be an integer" - }) - - #mdf_facts - #Not implemented - - #mdf_credits - #Not implemented - - #data must exist, and be a dictionary - if type(metadata.get("data", None)) is not dict: - invalid_list.append({ - "field" : "data", - "value" : metadata.get("data", None), - "reason" : "data is required" - }) - elif type(metadata.get("data").get("raw", "")) is not str: - invalid_list.append({ - "field" : "data['raw']", - "value" : metadata.get("data").get("raw", None), - "reason" : "data['raw'] must be a string" - }) - elif type(metadata.get("data").get("files", None)) is not dict: - invalid_list.append({ - "field" : "data['files']", - "value" : metadata.get("data").get("files", None), - "reason" : "data['files'] is required and must be a dictionary (but may be empty)" - }) - - - #No other metadata is allowed - disallowed_list = [x for x in metadata.keys() if x not in valid_list] - for key in disallowed_list: - invalid_list.append({ - "field" : key, - "value" : metadata.get(key, None), - "reason" : key + " is not a valid metadata field" - }) - - if not 
-        return {
-            "success" : True
-            }
-    else:
-        return {
-            "success" : False,
-            "invalid_metadata" : invalid_list,
-            "message" : "Invalid record metadata"
-            }
-
-if __name__ == "__main__":
-    print("\nThis is a copy of the Validator for example purposes. You can use the Validator to write valid, converted data into feedstock.")
-    print("There are in-depth instructions on this process in 'converter_template.py'.")
diff --git a/prototypes/sandbox.ipynb b/sandbox.ipynb
similarity index 100%
rename from prototypes/sandbox.ipynb
rename to sandbox.ipynb
diff --git a/prototypes/search-demo.ipynb b/search-demo.ipynb
similarity index 100%
rename from prototypes/search-demo.ipynb
rename to search-demo.ipynb
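
The element-extraction step in write_record above is easy to miss inside the diff; the following is a minimal standalone sketch of that same logic, outside the class. The function name extract_elements and the example composition string are invented for illustration and are not part of the Validator API.

# Mirrors the mdf-base.material_composition handling in Validator.write_record:
# an uppercase letter starts a new element symbol, a lowercase letter continues
# it, and everything else (digits, parentheses, whitespace) is skipped.
def extract_elements(composition):
    str_of_elem = ""
    for char in composition:
        if char.isupper():
            str_of_elem += " " + char
        elif char.islower():
            str_of_elem += char
    return list(set(str_of_elem.split()))

print(extract_elements("LiFePO4"))  # ['Li', 'Fe', 'P', 'O'], in some order

Note that, as in the Validator itself, the set() deduplicates repeated symbols but does not preserve their order of appearance in the formula.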