Skip to content

Commit

Permalink
Added clex_importer
Browse files Browse the repository at this point in the history
  • Loading branch information
ciioprof0 committed Aug 6, 2024
1 parent 8112a5b commit 03c9342
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 0 deletions.
87 changes: 87 additions & 0 deletions ling508/app/clex_importer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Module for importing the ACE Common Lexicon (Clex) into the STIX-D Corpus Database.
This script fetches the Clex lexicon file from the provided URI, parses the content,
and imports the lexical entries into the stixd_corpus.lexicon table.
"""

# Import Standard Library Modules
import hashlib # Provides SHA256 hash functionality
import requests # Allows fetching files from URI
from typing import Optional, Tuple # Type hinting

# Import Third-Party Modules
# pip install mysql-connector-python

# Import Project-Specific Modules
from db.mysql_repository import MySQLRepository # Provides database interaction

# Set Global Variables
# Raw-content URL of the Clex lexicon. The github.com ".../blob/..." address
# serves an HTML viewer page rather than the Prolog source, so the importer
# must fetch from raw.githubusercontent.com to receive parseable text.
CLEX_URI = "https://raw.githubusercontent.com/Attempto/Clex/master/clex_lexicon.pl"

# Connection settings handed to MySQLRepository. The user name and password
# are placeholders and must be replaced with real credentials before running.
DB_CONNECTION_PARAMS = {
    'host': 'localhost',
    'user': 'your_username',
    'password': 'your_password',
    'database': 'stixd_corpus',
}

# Define Script Classes
class ClexImporter:
    """Fetch a Clex lexicon file and store its entries through a repository.

    The repository object is expected to provide ``find_entry_by_id``,
    ``save_entry`` and ``link_existing_entry``; those are the only methods
    this class calls on it.
    """

    def __init__(self, db_repo: "MySQLRepository", clex_uri: str):
        """Remember the repository to write to and the URI to download from."""
        self.db_repo = db_repo
        self.clex_uri = clex_uri

    def fetch_clex_file(self) -> str:
        """Download the Clex file from ``self.clex_uri`` and return its text.

        Raises:
            requests.RequestException: on connection problems, on timeout, or
                (via ``raise_for_status``) on a non-success HTTP status.
        """
        # Without a timeout a stalled server would block the import forever.
        response = requests.get(self.clex_uri, timeout=30)
        response.raise_for_status()
        return response.text

    @staticmethod
    def _unquote(token: str) -> str:
        """Return the text of a Prolog atom, dropping surrounding single quotes."""
        if len(token) >= 2 and token[0] == "'" and token[-1] == "'":
            # Inside a quoted atom a doubled quote stands for one literal quote.
            return token[1:-1].replace("''", "'")
        return token

    def parse_clex_line(self, line: str) -> Tuple[str, str, str, Optional[str]]:
        """Parse one Clex fact such as ``noun_sg(woman, woman, fem).``.

        Arguments are split on commas that are outside single-quoted atoms,
        parsing stops at the first unquoted ``)``, and quotes are removed from
        every argument. Arguments beyond the third are ignored.

        Returns:
            ``(word_tag, word_form, logical_symbol, third_arg)`` where
            ``third_arg`` is ``None`` for two-argument facts.

        Raises:
            ValueError: if the line is not of the form ``tag(arg, arg[, arg])``.
        """
        head, paren, rest = line.strip().partition('(')
        word_tag = head.strip()
        if not paren or not word_tag.isidentifier():
            raise ValueError(f"not a Clex fact: {line!r}")

        args = []
        current = []
        in_quote = False
        closed = False
        for char in rest:
            if char == "'":
                # Quote characters are kept so _unquote can recognise the atom.
                in_quote = not in_quote
                current.append(char)
            elif in_quote:
                current.append(char)
            elif char == ',':
                args.append(''.join(current))
                current = []
            elif char == ')':
                args.append(''.join(current))
                closed = True
                break  # Anything after the closing parenthesis is ignored.
            else:
                current.append(char)

        if not closed or len(args) < 2:
            raise ValueError(f"not a Clex fact: {line!r}")

        cleaned = [self._unquote(arg.strip()) for arg in args]
        third_arg = cleaned[2] if len(cleaned) > 2 else None
        return word_tag, cleaned[0], cleaned[1], third_arg

    def generate_hash(self, word_tag: str, word_form: str) -> str:
        """Return the SHA256 hex digest of ``word_tag`` followed by ``word_form``.

        The two strings are concatenated without a separator; this is kept
        unchanged so digests stay identical to previously generated ones.
        """
        hash_object = hashlib.sha256(f"{word_tag}{word_form}".encode('utf-8'))
        return hash_object.hexdigest()

    def import_clex_entries(self) -> None:
        """Fetch the Clex file and store every parseable entry in the repository.

        Blank lines and ``%`` comment lines are skipped. Lines that cannot be
        parsed as a fact are reported on stdout and skipped, so a single stray
        line does not abort the whole import. New entries are saved; entries
        whose hash is already known are passed to ``link_existing_entry``.
        """
        clex_content = self.fetch_clex_file()
        for raw_line in clex_content.splitlines():
            line = raw_line.strip()
            # Strip first so indented comments are recognised as comments too.
            if not line or line.startswith('%'):
                continue
            try:
                word_tag, word_form, logical_symbol, third_arg = self.parse_clex_line(line)
            except ValueError as err:
                print(f"Skipping malformed Clex line: {err}")
                continue
            tag_form_hash = self.generate_hash(word_tag, word_form)

            if not self.db_repo.find_entry_by_id(tag_form_hash):
                entry = {
                    'word_tag': word_tag,
                    'word_form': word_form,
                    'tag_form_hash': tag_form_hash,
                    'logical_symbol': logical_symbol,
                    'third_arg': third_arg
                }
                self.db_repo.save_entry(entry)
            else:
                self.db_repo.link_existing_entry(tag_form_hash)
        print("Clex entries imported successfully.")

# Define Main Body of Script
if __name__ == "__main__":
    # Build the repository from the module-level connection settings and hand
    # it, together with the lexicon URI, to the importer.
    repository = MySQLRepository(DB_CONNECTION_PARAMS)
    importer = ClexImporter(repository, CLEX_URI)
    try:
        importer.import_clex_entries()
        print("Script completed successfully.")
    except Exception as error:
        # Top-level boundary: report any failure instead of showing a traceback.
        print(f"An error occurred: {error}")
62 changes: 62 additions & 0 deletions tests/test_clex.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
adv(fast, fast).
adv_comp(faster, fast).
adv_sup(fastest, fast).
adv(quickly, quickly).
adj_itr(large, large).
adj_itr_comp(larger, large).
adj_itr_sup(largest, large).
adj_itr(expensive, expensive).
adj_tr('valid-for', 'valid-for', for).
adj_tr('fond-of', 'fond-of', of).
adj_tr_comp('fonder-of', 'fond-of', of).
adj_tr_sup('fondest-of', 'fond-of', of).
adj_tr('pessimistic-about', 'pessimistic-about', about).
noun_sg(woman, woman, fem).
noun_pl(women, woman, fem).
noun_sg('credit-card', 'credit-card', neutr).
noun_pl('credit-cards', 'credit-card', neutr).
noun_sg(month, month, neutr).
noun_pl(months, month, neutr).
noun_mass(water, water, neutr).
noun_mass(fear, fear, neutr).
noun_mass(money, money, neutr).
mn_sg(kg, kg).
mn_pl(kg, kg).
mn_sg(m, m).
mn_pl(m, m).
mn_sg('°C', '°C').
mn_pl('°C', '°C').
pn_sg('John', 'John', masc).
pn_sg('Nokia', 'Nokia', neutr).
pndef_sg('Nile', 'Nile', neutr).
pndef_pl('United-Nations', 'United-Nations', neutr).
pn_sg('Mona-Lisa', 'Mona-Lisa', neutr).
pndef_sg('Mona-Lisa', 'Mona-Lisa', neutr).
iv_finsg(waits, wait).
iv_infpl(wait, wait).
iv_finsg('goes-away', 'go-away').
iv_infpl('go-away', 'go-away').
iv_finsg(walks, walk).
iv_infpl(walk, walk).
tv_finsg(knows, know).
tv_infpl(know, know).
tv_pp(known, know).
tv_finsg(likes, like).
tv_infpl(like, like).
tv_pp(liked, like).
tv_finsg('relates-to', 'relate-to').
tv_infpl('relate-to', 'relate-to').
tv_pp('related-to', 'relate-to').
dv_finsg(shows, show, '').
dv_infpl(show, show, '').
dv_pp(shown, show, '').
dv_finsg(shows, show, to).
dv_infpl(show, show, to).
dv_pp(shown, show, to).
dv_finsg(forgives, forgive, '').
dv_infpl(forgive, forgive, '').
dv_pp(forgiven, forgive, '').
dv_finsg(succeeds, succeed, as).
dv_infpl(succeed, succeed, as).
dv_pp(succeeded, succeed, as).
prep(WordForm, LogicalSymbol).

0 comments on commit 03c9342

Please sign in to comment.