From 03c934281777fecd3edb1d8622310bbf0839c17d Mon Sep 17 00:00:00 2001
From: ciioprof0 <116185051+ciioprof0@users.noreply.github.com>
Date: Tue, 6 Aug 2024 11:11:58 -0700
Subject: [PATCH] Added clex_importer

---
 ling508/app/clex_importer.py | 129 ++++++++++++++++++++++++++++++++++++
 tests/test_clex.pl           |  62 +++++++++++++++++
 2 files changed, 191 insertions(+)
 create mode 100644 ling508/app/clex_importer.py
 create mode 100644 tests/test_clex.pl

diff --git a/ling508/app/clex_importer.py b/ling508/app/clex_importer.py
new file mode 100644
index 0000000..b3aa940
--- /dev/null
+++ b/ling508/app/clex_importer.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Module for importing the ACE Common Lexicon (Clex) into the STIX-D Corpus Database.
+
+This script fetches the Clex lexicon file from the provided URI, parses the content,
+and imports the lexical entries into the stixd_corpus.lexicon table.
+"""
+
+# Import Standard Library Modules
+import csv  # Splits Prolog argument lists while honouring quoted atoms
+import hashlib  # Provides SHA256 hash functionality
+import re  # Matches the overall shape of a Clex fact
+import sys  # Provides stderr and the process exit status
+from typing import Optional, Tuple  # Type hinting
+
+# Import Third-Party Modules
+# pip install mysql-connector-python requests
+import requests  # Allows fetching files from URI
+
+# Import Project-Specific Modules
+from db.mysql_repository import MySQLRepository  # Provides database interaction
+
+# Set Global Variables
+# The raw-content host is required: the github.com "blob" page serves HTML, not Prolog.
+CLEX_URI = "https://raw.githubusercontent.com/Attempto/Clex/master/clex_lexicon.pl"
+REQUEST_TIMEOUT_SECONDS = 30  # Keeps a stalled download from hanging the import
+DB_CONNECTION_PARAMS = {
+    'host': 'localhost',
+    'user': 'your_username',
+    'password': 'your_password',
+    'database': 'stixd_corpus'
+}
+
+# Shape of one Clex fact: tag(arg, arg[, arg]).
+CLEX_ENTRY_PATTERN = re.compile(r"^\s*(\w+)\s*\((.*)\)\s*\.\s*$")
+
+
+# Define Script Classes
+class ClexImporter:
+    """Fetches a Clex lexicon file and stores its entries through a repository."""
+
+    def __init__(self, db_repo: MySQLRepository, clex_uri: str):
+        """Store the repository used for persistence and the lexicon URI to fetch."""
+        self.db_repo = db_repo
+        self.clex_uri = clex_uri
+
+    def fetch_clex_file(self) -> str:
+        """Fetch the Clex file content from the configured URI.
+
+        Raises:
+            requests.RequestException: on connection failure, timeout, or HTTP error.
+        """
+        response = requests.get(self.clex_uri, timeout=REQUEST_TIMEOUT_SECONDS)
+        response.raise_for_status()
+        return response.text
+
+    def parse_clex_line(self, line: str) -> Tuple[str, str, str, Optional[str]]:
+        """Parse one Clex fact such as "noun_sg(woman, woman, fem).".
+
+        Quoted atoms are unquoted: 'fond-of' becomes fond-of and '' becomes the
+        empty string.
+
+        Returns:
+            (word_tag, word_form, logical_symbol, third_arg); third_arg is None
+            when the fact has only two arguments.
+
+        Raises:
+            ValueError: if the line is not a fact with at least two arguments.
+        """
+        match = CLEX_ENTRY_PATTERN.match(line)
+        if match is None:
+            raise ValueError(f"Malformed Clex entry: {line!r}")
+        word_tag, raw_args = match.groups()
+        # csv keeps commas inside quoted atoms intact and drops the quotes.
+        reader = csv.reader([raw_args], quotechar="'", skipinitialspace=True)
+        args = [arg.strip() for arg in next(reader, [])]
+        if len(args) < 2:
+            raise ValueError(f"Clex entry needs at least two arguments: {line!r}")
+        third_arg = args[2] if len(args) > 2 else None
+        return word_tag, args[0], args[1], third_arg
+
+    def generate_hash(self, word_tag: str, word_form: str) -> str:
+        """Return the SHA256 hex digest of word_tag directly followed by word_form."""
+        # NOTE(review): with no separator, ("ab", "c") and ("a", "bc") collide.
+        # Left unchanged because the digest is stored as the entry identifier.
+        return hashlib.sha256(f"{word_tag}{word_form}".encode('utf-8')).hexdigest()
+
+    def import_clex_entries(self) -> None:
+        """Fetch the lexicon and save each entry, linking those already stored.
+
+        Blank lines, "%" comments and ":-" directives are skipped.
+
+        Raises:
+            requests.RequestException: if the lexicon cannot be downloaded.
+            ValueError: if a remaining line is not a well-formed Clex fact.
+        """
+        clex_content = self.fetch_clex_file()
+        for line in clex_content.splitlines():
+            stripped = line.strip()
+            if not stripped or stripped.startswith(('%', ':-')):
+                continue  # Skip empty lines, comments and Prolog directives
+            word_tag, word_form, logical_symbol, third_arg = self.parse_clex_line(stripped)
+            tag_form_hash = self.generate_hash(word_tag, word_form)
+
+            if not self.db_repo.find_entry_by_id(tag_form_hash):
+                entry = {
+                    'word_tag': word_tag,
+                    'word_form': word_form,
+                    'tag_form_hash': tag_form_hash,
+                    'logical_symbol': logical_symbol,
+                    'third_arg': third_arg
+                }
+                self.db_repo.save_entry(entry)
+            else:
+                self.db_repo.link_existing_entry(tag_form_hash)
+        print("Clex entries imported successfully.")
+
+# Define Main Body of Script
+if __name__ == "__main__":
+    db_repo = MySQLRepository(DB_CONNECTION_PARAMS)
+    clex_importer = ClexImporter(db_repo, CLEX_URI)
+    try:
+        clex_importer.import_clex_entries()
+    except Exception as e:  # Top-level boundary: report and signal failure
+        print(f"An error occurred: {e}", file=sys.stderr)
+        sys.exit(1)
+    print("Script completed successfully.")
diff --git a/tests/test_clex.pl b/tests/test_clex.pl
new file mode 100644
index 0000000..2a398fc
--- /dev/null
+++ b/tests/test_clex.pl
@@ -0,0 +1,62 @@
+adv(fast, fast).
+adv_comp(faster, fast).
+adv_sup(fastest, fast).
+adv(quickly, quickly).
+adj_itr(large, large).
+adj_itr_comp(larger, large).
+adj_itr_sup(largest, large).
+adj_itr(expensive, expensive).
+adj_tr('valid-for', 'valid-for', for).
+adj_tr('fond-of', 'fond-of', of).
+adj_tr_comp('fonder-of', 'fond-of', of).
+adj_tr_sup('fondest-of', 'fond-of', of).
+adj_tr('pessimistic-about', 'pessimistic-about', about).
+noun_sg(woman, woman, fem).
+noun_pl(women, woman, fem).
+noun_sg('credit-card', 'credit-card', neutr).
+noun_pl('credit-cards', 'credit-card', neutr).
+noun_sg(month, month, neutr).
+noun_pl(months, month, neutr).
+noun_mass(water, water, neutr).
+noun_mass(fear, fear, neutr).
+noun_mass(money, money, neutr).
+mn_sg(kg, kg).
+mn_pl(kg, kg).
+mn_sg(m, m).
+mn_pl(m, m).
+mn_sg('°C', '°C').
+mn_pl('°C', '°C').
+pn_sg('John', 'John', masc).
+pn_sg('Nokia', 'Nokia', neutr).
+pndef_sg('Nile', 'Nile', neutr).
+pndef_pl('United-Nations', 'United-Nations', neutr).
+pn_sg('Mona-Lisa', 'Mona-Lisa', neutr).
+pndef_sg('Mona-Lisa', 'Mona-Lisa', neutr).
+iv_finsg(waits, wait).
+iv_infpl(wait, wait).
+iv_finsg('goes-away', 'go-away').
+iv_infpl('go-away', 'go-away').
+iv_finsg(walks, walk).
+iv_infpl(walk, walk).
+tv_finsg(knows, know).
+tv_infpl(know, know).
+tv_pp(known, know).
+tv_finsg(likes, like).
+tv_infpl(like, like).
+tv_pp(liked, like).
+tv_finsg('relates-to', 'relate-to').
+tv_infpl('relate-to', 'relate-to').
+tv_pp('related-to', 'relate-to').
+dv_finsg(shows, show, '').
+dv_infpl(show, show, '').
+dv_pp(shown, show, '').
+dv_finsg(shows, show, to).
+dv_infpl(show, show, to).
+dv_pp(shown, show, to).
+dv_finsg(forgives, forgive, '').
+dv_infpl(forgive, forgive, '').
+dv_pp(forgiven, forgive, '').
+dv_finsg(succeeds, succeed, as).
+dv_infpl(succeed, succeed, as).
+dv_pp(succeeded, succeed, as).
+prep(WordForm, LogicalSymbol).
\ No newline at end of file