Commit

Added docscrapper.py
ciioprof0 committed Aug 5, 2024
1 parent 268d499 commit 8406d53
Showing 12 changed files with 508 additions and 1 deletion.
33 changes: 33 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,33 @@
name: CI

on:
  push:
    branches:
      - main
      - 'feature/**'
  pull_request:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest

      - name: Run tests
        run: |
          pytest tests --verbose
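
The Run tests step assumes a tests/ directory that pytest can discover; no test files appear in this commit. As a hedged sketch only, a minimal test for the pure allows_scraping helper added in 508/app/doc_scrapper.py could look like the following, assuming the 508/ directory is on the import path:

# tests/test_doc_scrapper.py -- hypothetical example, not part of this commit
from app.doc_scrapper import allows_scraping  # assumes 508/ is on sys.path

def test_plain_page_is_allowed():
    html = "<html><head><title>ok</title></head><body>hi</body></html>"
    assert allows_scraping(html) is True

def test_noindex_page_is_skipped():
    html = '<html><head><meta name="robots" content="noindex, nofollow"></head><body></body></html>'
    assert allows_scraping(html) is False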
70 changes: 70 additions & 0 deletions .vscode/ltex.dictionary.en-US.txt
@@ -0,0 +1,70 @@
STIX-D
anaphorically
STIX
anaphoric
coordinations
Anaphoric
Attempto
uncompiled
uncompiled
Prolog
Clex
Ulex
undef
neutr
acts-upon
act-upon
acts-with
act-with
ad-libs
ad-lib
adapts-to
adapt-to
adds-in
add-in
adds-on
add-on
adds-onto
add-onto
adds-to
add-to
adds-together
add-together
adds-up
add-up
adds-up-to
add-up-to
adheres-to
adhere-to
adjoins-with
adjoin-with
adjourns-from
adjourn-from
adjourns-to
adjourn-to
adjudicates-on
adjudicate-on
adjudicates-upon
adjudicate-upon
adjusts-for
adjust-for
act-out
act-out-of
act-through
act-under
act-upon
act-with
ad-lib
adapt-to
add-in
add-on
add-onto
add-to
add-together
add-up
add-up-to
adhere-to
adjoin-with
adjourn-from
adjourn-to
adjudicate-on
4 changes: 4 additions & 0 deletions .vscode/ltex.hiddenFalsePositives.en-US.txt
@@ -0,0 +1,4 @@
{"rule":"POSSESSIVE_APOSTROPHE","sentence":"^\\QThe special determiners nobody but , nothing but , and their variant no ... but are used with bare plural noun phrases, bare mass noun phrases or proper names.\\E$"}
{"rule":"ENGLISH_WORD_REPEAT_RULE","sentence":"^\\Qconditional sentences (if-then sentences)\nlogical negation\nnegation as failure\nmodality\nsentence subordination\\E$"}
{"rule":"WHO_NOUN","sentence":"^\\Qundef they, them, themselves their, their own that, whose\nneutr it, itself its, its own which\nhuman he/she, she/he, him/her, her/him, himself/herself, herself/himself his/her, her/his, his/her own, her/his own who\nmasc he, him, himself his, his own\nfem she, her, herself her, her own\\E$"}
{"rule":"ADMIT_ENJOY_VB","sentence":"^\\QIt is also recommended to define a third singular form for each infinitive form, and vice versa.\\E$"}
91 changes: 90 additions & 1 deletion .vscode/settings.json
@@ -1,10 +1,99 @@
{
"cSpell.words": [
"act-out",
"act-out-of",
"act-through",
"act-under",
"act-upon",
"act-with",
"acts-upon",
"acts-with",
"ad-lib",
"ad-libs",
"adapt-to",
"adapts-to",
"add-in",
"add-on",
"add-onto",
"add-to",
"add-together",
"add-up",
"add-up-to",
"adds-in",
"adds-on",
"adds-onto",
"adds-to",
"adds-together",
"adds-up",
"adds-up-to",
"adhere-to",
"adheres-to",
"adjoin-with",
"adjoins-with",
"adjourn-from",
"adjourn-to",
"adjourns-from",
"adjourns-to",
"adjudicate-on",
"adjudicate-upon",
"adjudicates-on",
"adjudicates-upon",
"adjust-for",
"adjusts-for",
"alexic",
"anapaestic",
"antenuptial",
"antimonopoly",
"antitrade",
"après",
"avant",
"bleu",
"browed",
"Clex",
"colour",
"falutin",
"filmable",
"filtertipped",
"finsg",
"flavourless",
"footsure",
"gemmed",
"godfearing",
"greeneyed",
"gynaecological",
"hardbacked",
"hardcovered",
"hardhitting",
"hardpressed",
"harum",
"heavensent",
"higgledy",
"highflown",
"highflying",
"hoity",
"honourable",
"housetrained",
"humourless",
"icefree",
"infpl",
"Leonino",
"Multiwords",
"namby",
"neutr",
"niminy",
"omni",
"pamby",
"piggledy",
"piminy",
"pndef",
"prohib"
"posteriori",
"prohib",
"rigeur",
"scarum",
"toity",
"Ulex",
"uncared",
"uncompiled",
"vires"
]
}
Empty file added 508/app/__init__.py
Empty file.
88 changes: 88 additions & 0 deletions 508/app/doc_scrapper.py
@@ -0,0 +1,88 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Script to convert HTML content from URLs to Markdown, respecting robots meta tags.
"""

# Import Standard Library Modules
import argparse
import re
import os

# Import Third-Party Modules
from bs4 import BeautifulSoup  # Library for parsing HTML and XML documents
import html2text               # Convert HTML to Markdown
import requests                # HTTP library for Python

# Fetch HTML content from the URL
def fetch_html(url: str) -> str:
    response = requests.get(url)
    response.raise_for_status()
    return response.text

# Extract the body and title from the HTML content
def extract_body_and_title(html: str) -> tuple[str, str]:
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.body
    title = soup.title.string if soup.title else 'untitled'
    return str(body), title

# Convert HTML to Markdown
def convert_html_to_markdown(html: str) -> str:
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    return converter.handle(html)

# Save Markdown text to a file
def save_markdown(markdown_text: str, title: str, save_path: str) -> str:
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    filename = f"{title}.md"
    filename = "".join(x for x in filename if x.isalnum() or x in "._- ")
    full_path = os.path.join(save_path, filename)
    with open(full_path, 'w', encoding='utf-8') as file:
        file.write(markdown_text)
    return full_path

# Check if the site allows scraping by examining meta tags
def allows_scraping(html: str) -> bool:
    soup = BeautifulSoup(html, 'html.parser')
    meta_tags = soup.find_all('meta', attrs={'name': re.compile(r'robots', re.IGNORECASE)})
    for meta in meta_tags:
        if 'noindex' in meta.get('content', '').lower() or 'nofollow' in meta.get('content', '').lower():
            return False
    return True

# Process the given URL
def process_url(url: str, save_path: str):
    try:
        html = fetch_html(url)
        if allows_scraping(html):
            body_html, title = extract_body_and_title(html)
            markdown_text = convert_html_to_markdown(body_html)
            filename = save_markdown(markdown_text, title, save_path)
            print(f"Markdown saved as {filename}")
        else:
            print(f"Skipping {url}: Scraping not allowed (robots meta tag).")
    except Exception as e:
        print(f"Failed to process {url}: {e}")

# Main function to handle input path (URL or .txt file containing URLs)
def main(input_path: str, save_path: str):
    if input_path.endswith('.txt'):
        with open(input_path, 'r', encoding='utf-8') as file:
            urls = file.readlines()
        urls = [url.strip() for url in urls]
        for url in urls:
            if url:
                process_url(url, save_path)
    else:
        process_url(input_path, save_path)

# Entry point for the script
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert HTML content from URLs to Markdown, respecting robots meta tags.")
    parser.add_argument('input_path', nargs='?', default='docs/ace/ace_docs.txt', help="URL or path to a .txt file containing URLs")
    parser.add_argument('--save_path', default='docs/ace/', help="Path to save the Markdown files")
    args = parser.parse_args()
    main(args.input_path, args.save_path)
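
For reference, a hedged sketch of how the script above might be driven programmatically rather than from the command line. The URL is a placeholder, the paths are the argparse defaults shown above, and the 508/ directory is assumed to be on sys.path:

# Hypothetical usage sketch; URL and paths are placeholders.
from app.doc_scrapper import process_url, main

# Convert a single page to Markdown (network access assumed).
process_url("https://example.com/docs/intro.html", "docs/ace/")

# Or process every URL listed in a text file, one per line.
main("docs/ace/ace_docs.txt", "docs/ace/")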
Empty file added 508/db/__init__.py
Empty file.
88 changes: 88 additions & 0 deletions 508/db/mysql_repository.py
@@ -0,0 +1,88 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from typing import Any, Dict, List, Optional, Set
import mysql.connector
from db.repository import AbstractRepository
from model.lexical_entry import Lexeme

class MySQLRepository(AbstractRepository):

    def __init__(self, connection_params):
        self.connection_params = connection_params

    def _connect(self):
        return mysql.connector.connect(**self.connection_params)

    def load_entries(self) -> List[Lexeme]:
        conn = self._connect()
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM entries")
        rows = cursor.fetchall()
        conn.close()
        return [self._map_row_to_entry(row) for row in rows]

    def save_entry(self, entry: Lexeme) -> None:
        conn = self._connect()
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO entries (base_form, pos, definition, synonyms, antonyms) VALUES (%s, %s, %s, %s, %s)",
            (entry.base_form, entry.pos, entry.definition, ",".join(entry.synonyms), ",".join(entry.antonyms))
        )
        conn.commit()
        conn.close()

    def find_entry_by_id(self, entry_id: int) -> Optional[Lexeme]:
        conn = self._connect()
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM entries WHERE id = %s", (entry_id,))
        row = cursor.fetchone()
        conn.close()
        return self._map_row_to_entry(row) if row else None

    def find_stix_object_by_id(self, obj_id: str) -> Optional[Dict]:
        conn = self._connect()
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM stix_objects WHERE obj_id = %s", (obj_id,))
        row = cursor.fetchone()
        conn.close()
        return self._map_row_to_stix_object(row) if row else None

    def save_stix_object(self, stix_object: Dict) -> None:
        conn = self._connect()
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO stix_objects (obj_id, type, created_by_ref, description, spec_version, created, modified, revoked, labels, confidence, lang, external_references, object_marking_refs, granular_markings, extensions, derived_from, duplicate_of, related_to) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            (
                stix_object['obj_id'], stix_object['type'], stix_object['created_by_ref'], stix_object['description'],
                stix_object['spec_version'], stix_object['created'], stix_object['modified'], stix_object['revoked'],
                stix_object['labels'], stix_object['confidence'], stix_object['lang'], stix_object['external_references'],
                stix_object['object_marking_refs'], stix_object['granular_markings'], stix_object['extensions'],
                stix_object['derived_from'], stix_object['duplicate_of'], stix_object['related_to']
            )
        )
        conn.commit()
        conn.close()

    def _map_row_to_entry(self, row) -> Optional[Lexeme]:
        if not row:
            return None
        id, base_form, pos, definition, synonyms, antonyms = row
        return Lexeme(
            base_form=base_form,
            pos=pos,
            definition=definition,
            synonyms=set(synonyms.split(",")),
            antonyms=set(antonyms.split(","))
        )

    def _map_row_to_stix_object(self, row) -> Optional[Dict]:
        if not row:
            return None
        return {
            'obj_id': row[0], 'type': row[1], 'created_by_ref': row[2], 'description': row[3],
            'spec_version': row[4], 'created': row[5], 'modified': row[6], 'revoked': row[7],
            'labels': row[8], 'confidence': row[9], 'lang': row[10], 'external_references': row[11],
            'object_marking_refs': row[12], 'granular_markings': row[13], 'extensions': row[14],
            'derived_from': row[15], 'duplicate_of': row[16], 'related_to': row[17]
        }
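
A hedged usage sketch of the repository follows. The connection parameters, database name, and table contents are placeholder assumptions; only the Lexeme fields and column names come from the queries and row mapping above:

# Hypothetical usage; credentials, database name, and data are placeholders.
from db.mysql_repository import MySQLRepository
from model.lexical_entry import Lexeme

repo = MySQLRepository({
    "host": "localhost",
    "user": "stixd_user",      # placeholder credentials
    "password": "change-me",
    "database": "stixd",
})

# Persist one lexical entry, then read all entries back.
repo.save_entry(Lexeme(
    base_form="adhere-to",
    pos="verb",
    definition="to act in accordance with",
    synonyms={"comply-with"},
    antonyms=set(),
))
for lexeme in repo.load_entries():
    print(lexeme.base_form, lexeme.pos)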