Commit
Showing 12 changed files with 508 additions and 1 deletion.
@@ -0,0 +1,33 @@
name: CI

on:
  push:
    branches:
      - main
      - 'feature/**'
  pull_request:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest
      - name: Run tests
        run: |
          pytest tests --verbose
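The "Run tests" step above simply invokes pytest on the tests directory, so any standard pytest module placed there is collected automatically. A minimal sketch of such a test, purely for illustration (the repository's actual test files are not shown in this diff, and tests/test_smoke.py is a hypothetical name):

    # tests/test_smoke.py -- hypothetical placeholder, not part of this commit.
    # pytest collects any test_*.py module under tests/, so this is the smallest
    # file the "Run tests" step above would discover and run.

    def test_smoke():
        # Trivial assertion so the CI job has at least one passing test to report.
        assert True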
@@ -0,0 +1,70 @@
STIX-D
anaphorically
STIX
anaphoric
coordinations
Anaphoric
Attempto
uncompiled
uncompiled
Prolog
Clex
Ulex
undef
neutr
acts-upon
act-upon
acts-with
act-with
ad-libs
ad-lib
adapts-to
adapt-to
adds-in
add-in
adds-on
add-on
adds-onto
add-onto
adds-to
add-to
adds-together
add-together
adds-up
add-up
adds-up-to
add-up-to
adheres-to
adhere-to
adjoins-with
adjoin-with
adjourns-from
adjourn-from
adjourns-to
adjourn-to
adjudicates-on
adjudicate-on
adjudicates-upon
adjudicate-upon
adjusts-for
adjust-for
act-out
act-out-of
act-through
act-under
act-upon
act-with
ad-lib
adapt-to
add-in
add-on
add-onto
add-to
add-together
add-up
add-up-to
adhere-to
adjoin-with
adjourn-from
adjourn-to
adjudicate-on
@@ -0,0 +1,4 @@
{"rule":"POSSESSIVE_APOSTROPHE","sentence":"^\\QThe special determiners nobody but , nothing but , and their variant no ... but are used with bare plural noun phrases, bare mass noun phrases or proper names.\\E$"} | ||
{"rule":"ENGLISH_WORD_REPEAT_RULE","sentence":"^\\Qconditional sentences (if-then sentences)\nlogical negation\nnegation as failure\nmodality\nsentence subordination\\E$"} | ||
{"rule":"WHO_NOUN","sentence":"^\\Qundef they, them, themselves their, their own that, whose\nneutr it, itself its, its own which\nhuman he/she, she/he, him/her, her/him, himself/herself, herself/himself his/her, her/his, his/her own, her/his own who\nmasc he, him, himself his, his own\nfem she, her, herself her, her own\\E$"} | ||
{"rule":"ADMIT_ENJOY_VB","sentence":"^\\QIt is also recommended to define a third singular form for each infinitive form, and vice versa.\\E$"} |
@@ -1,10 +1,99 @@
{
  "cSpell.words": [
    "act-out",
    "act-out-of",
    "act-through",
    "act-under",
    "act-upon",
    "act-with",
    "acts-upon",
    "acts-with",
    "ad-lib",
    "ad-libs",
    "adapt-to",
    "adapts-to",
    "add-in",
    "add-on",
    "add-onto",
    "add-to",
    "add-together",
    "add-up",
    "add-up-to",
    "adds-in",
    "adds-on",
    "adds-onto",
    "adds-to",
    "adds-together",
    "adds-up",
    "adds-up-to",
    "adhere-to",
    "adheres-to",
    "adjoin-with",
    "adjoins-with",
    "adjourn-from",
    "adjourn-to",
    "adjourns-from",
    "adjourns-to",
    "adjudicate-on",
    "adjudicate-upon",
    "adjudicates-on",
    "adjudicates-upon",
    "adjust-for",
    "adjusts-for",
    "alexic",
    "anapaestic",
    "antenuptial",
    "antimonopoly",
    "antitrade",
    "après",
    "avant",
    "bleu",
    "browed",
    "Clex",
    "colour",
    "falutin",
    "filmable",
    "filtertipped",
    "finsg",
    "flavourless",
    "footsure",
    "gemmed",
    "godfearing",
    "greeneyed",
    "gynaecological",
    "hardbacked",
    "hardcovered",
    "hardhitting",
    "hardpressed",
    "harum",
    "heavensent",
    "higgledy",
    "highflown",
    "highflying",
    "hoity",
    "honourable",
    "housetrained",
    "humourless",
    "icefree",
    "infpl",
    "Leonino",
    "Multiwords",
    "namby",
    "neutr",
    "niminy",
    "omni",
    "pamby",
    "piggledy",
    "piminy",
    "pndef",
"prohib" | ||
"posteriori", | ||
"prohib", | ||
"rigeur", | ||
"scarum", | ||
"toity", | ||
"Ulex", | ||
"uncared", | ||
"uncompiled", | ||
"vires" | ||
] | ||
} |
Empty file.
@@ -0,0 +1,88 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Script to convert HTML content from URLs to Markdown, respecting robots meta tags.
"""

# Import Standard Library Modules
import argparse
import re
import os

# Import Third-Party Modules
from bs4 import BeautifulSoup  # Library for parsing HTML and XML documents
import html2text               # Convert HTML to Markdown
import requests                # HTTP library for Python

# Fetch HTML content from the URL
def fetch_html(url: str) -> str:
    response = requests.get(url)
    response.raise_for_status()
    return response.text

# Extract the body and title from the HTML content
def extract_body_and_title(html: str) -> tuple[str, str]:
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.body
    title = soup.title.string if soup.title else 'untitled'
    return str(body), title

# Convert HTML to Markdown
def convert_html_to_markdown(html: str) -> str:
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    return converter.handle(html)

# Save Markdown text to a file
def save_markdown(markdown_text: str, title: str, save_path: str) -> str:
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    filename = f"{title}.md"
    filename = "".join(x for x in filename if x.isalnum() or x in "._- ")
    full_path = os.path.join(save_path, filename)
    with open(full_path, 'w', encoding='utf-8') as file:
        file.write(markdown_text)
    return full_path

# Check if the site allows scraping by examining meta tags
def allows_scraping(html: str) -> bool:
    soup = BeautifulSoup(html, 'html.parser')
    meta_tags = soup.find_all('meta', attrs={'name': re.compile(r'robots', re.IGNORECASE)})
    for meta in meta_tags:
        if 'noindex' in meta.get('content', '').lower() or 'nofollow' in meta.get('content', '').lower():
            return False
    return True

# Process the given URL
def process_url(url: str, save_path: str):
    try:
        html = fetch_html(url)
        if allows_scraping(html):
            body_html, title = extract_body_and_title(html)
            markdown_text = convert_html_to_markdown(body_html)
            filename = save_markdown(markdown_text, title, save_path)
            print(f"Markdown saved as {filename}")
        else:
            print(f"Skipping {url}: Scraping not allowed (robots meta tag).")
    except Exception as e:
        print(f"Failed to process {url}: {e}")

# Main function to handle input path (URL or .txt file containing URLs)
def main(input_path: str, save_path: str):
    if input_path.endswith('.txt'):
        with open(input_path, 'r', encoding='utf-8') as file:
            urls = file.readlines()
        urls = [url.strip() for url in urls]
        for url in urls:
            if url:
                process_url(url, save_path)
    else:
        process_url(input_path, save_path)

# Entry point for the script
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert HTML content from URLs to Markdown, respecting robots meta tags.")
    parser.add_argument('input_path', nargs='?', default='docs/ace/ace_docs.txt', help="URL or path to a .txt file containing URLs")
    parser.add_argument('--save_path', default='docs/ace/', help="Path to save the Markdown files")
    args = parser.parse_args()
    main(args.input_path, args.save_path)
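A hedged usage sketch of the functions defined above (the script's real filename is not visible in this diff; html_to_markdown below is assumed purely for illustration):

    # Hypothetical interactive use; the module name html_to_markdown is an
    # assumption -- substitute the script's actual filename in the repository.
    from html_to_markdown import process_url

    # Fetches the page, checks its robots meta tags, and writes "<title>.md"
    # into docs/example/ only if scraping is allowed.
    process_url("https://example.com", "docs/example/")

Run as a script, it instead takes a URL or a .txt file of URLs (one per line) as input_path, defaulting to docs/ace/ace_docs.txt.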
Empty file.
@@ -0,0 +1,88 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from typing import Any, Dict, List, Optional, Set
import mysql.connector
from db.repository import AbstractRepository
from model.lexical_entry import Lexeme

class MySQLRepository(AbstractRepository):

    def __init__(self, connection_params):
        self.connection_params = connection_params

    def _connect(self):
        return mysql.connector.connect(**self.connection_params)

    def load_entries(self) -> List[Lexeme]:
        conn = self._connect()
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM entries")
        rows = cursor.fetchall()
        conn.close()
        return [self._map_row_to_entry(row) for row in rows]

    def save_entry(self, entry: Lexeme) -> None:
        conn = self._connect()
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO entries (base_form, pos, definition, synonyms, antonyms) VALUES (%s, %s, %s, %s, %s)",
            (entry.base_form, entry.pos, entry.definition, ",".join(entry.synonyms), ",".join(entry.antonyms))
        )
        conn.commit()
        conn.close()

    def find_entry_by_id(self, entry_id: int) -> Optional[Lexeme]:
        conn = self._connect()
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM entries WHERE id = %s", (entry_id,))
        row = cursor.fetchone()
        conn.close()
        return self._map_row_to_entry(row) if row else None

    def find_stix_object_by_id(self, obj_id: str) -> Optional[Dict]:
        conn = self._connect()
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM stix_objects WHERE obj_id = %s", (obj_id,))
        row = cursor.fetchone()
        conn.close()
        return self._map_row_to_stix_object(row) if row else None

    def save_stix_object(self, stix_object: Dict) -> None:
        conn = self._connect()
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO stix_objects (obj_id, type, created_by_ref, description, spec_version, created, modified, revoked, labels, confidence, lang, external_references, object_marking_refs, granular_markings, extensions, derived_from, duplicate_of, related_to) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            (
                stix_object['obj_id'], stix_object['type'], stix_object['created_by_ref'], stix_object['description'],
                stix_object['spec_version'], stix_object['created'], stix_object['modified'], stix_object['revoked'],
                stix_object['labels'], stix_object['confidence'], stix_object['lang'], stix_object['external_references'],
                stix_object['object_marking_refs'], stix_object['granular_markings'], stix_object['extensions'],
                stix_object['derived_from'], stix_object['duplicate_of'], stix_object['related_to']
            )
        )
        conn.commit()
        conn.close()

    def _map_row_to_entry(self, row) -> Optional[Lexeme]:
        if not row:
            return None
        _id, base_form, pos, definition, synonyms, antonyms = row
        return Lexeme(
            base_form=base_form,
            pos=pos,
            definition=definition,
            synonyms=set(synonyms.split(",")),
            antonyms=set(antonyms.split(","))
        )

    def _map_row_to_stix_object(self, row) -> Optional[Dict]:
        if not row:
            return None
        return {
            'obj_id': row[0], 'type': row[1], 'created_by_ref': row[2], 'description': row[3],
            'spec_version': row[4], 'created': row[5], 'modified': row[6], 'revoked': row[7],
            'labels': row[8], 'confidence': row[9], 'lang': row[10], 'external_references': row[11],
            'object_marking_refs': row[12], 'granular_markings': row[13], 'extensions': row[14],
            'derived_from': row[15], 'duplicate_of': row[16], 'related_to': row[17]
        }
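A hedged usage sketch of the repository class above; the import path, connection parameters, and Lexeme field values are placeholders, with the Lexeme keyword arguments mirroring _map_row_to_entry:

    # Hypothetical wiring of MySQLRepository. The import path db.mysql_repository
    # and all connection values are assumptions for illustration only.
    from db.mysql_repository import MySQLRepository
    from model.lexical_entry import Lexeme

    repo = MySQLRepository({
        "host": "localhost",
        "user": "lexicon",
        "password": "change-me",
        "database": "lexicon",
    })

    # Insert one lexeme and read everything back; synonyms/antonyms are stored
    # as comma-separated strings by save_entry and re-split by load_entries.
    repo.save_entry(Lexeme(
        base_form="adhere-to",
        pos="verb",
        definition="to act in accordance with a rule",
        synonyms={"follow"},
        antonyms={"ignore"},
    ))
    entries = repo.load_entries()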