Commit
Merge pull request #12 from Amber-Williams/develop
Structured data using gpt-4o-mini & refactor
Showing 14 changed files with 1,546 additions and 1,270 deletions.
@@ -1,2 +1,2 @@
 OPENAI_API_KEY=
-OPENAI_MODEL=gpt-3.5-turbo-1106
+OPENAI_MODEL=gpt-4o-mini
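The default model moves from gpt-3.5-turbo-1106 to gpt-4o-mini, one of the structured-output-capable models the new extractor accepts. A minimal sketch of consuming these variables, assuming the project loads its .env with python-dotenv (which this diff does not show):

    import os

    from dotenv import load_dotenv  # assumption: the project uses python-dotenv

    load_dotenv()  # reads the .env file into the process environment
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
    OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")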
This file was deleted.
File renamed without changes.
@@ -0,0 +1,58 @@

    import json
    from typing import List

    from httpx import Timeout
    from openai import OpenAI
    from pydantic import ValidationError
    import tiktoken

    from models import JobPosting, JobPostings


    class DataExtractorModel:
        def __init__(self, model_key: str, model: str):
            self.valid_gpt_models = [
                # Models must support structured data extraction
                "gpt-4o-mini",
                "gpt-4o",
                "gpt-4o-2024-08-06",
            ]

            if model not in self.valid_gpt_models:
                raise ValueError(
                    f"Invalid model. Available GPT models: {', '.join(self.valid_gpt_models)}"
                )
            self.model = model
            self.client = OpenAI(api_key=model_key)
            self.token_encoding = tiktoken.encoding_for_model(self.model)

            self.system_role_prompt = """
            You are a data extraction expert that extracts job posting data based on a list of job postings you are provided.
            Important rules:
            - There can be many roles in one posting; in these cases the comment_id will be the same for the related roles.
            - If a job title contains 'and' or '&', split it into more than one job posting."""

        def get_token_estimate(self, content: str) -> int:
            # encode() returns a list of token ids; the estimate is its length
            return len(self.token_encoding.encode(content))

        def extract(self, content) -> List[JobPosting]:
            try:
                content_json = json.dumps(content)
                completion = self.client.beta.chat.completions.parse(
                    model=self.model,
                    response_format=JobPostings,
                    messages=[
                        {"role": "system", "content": self.system_role_prompt},
                        {"role": "user", "content": content_json},
                    ],
                    temperature=0.0,
                    timeout=Timeout(60),
                )
                return completion.choices[0].message.parsed.postings
            except ValidationError as e:
                print(f"Validation error: {e}")
                raise
            except Exception as err:
                print(f"Unexpected {err=}, {type(err)=}")
                raise
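models.py is not part of this excerpt, so the shape of JobPosting and JobPostings is not shown. A hypothetical sketch of pydantic schemas that would satisfy response_format=JobPostings above; only comment_id (named in the system prompt) and the postings attribute (read in extract) are confirmed by the code, the remaining fields are illustrative assumptions:

    # Hypothetical sketch of models.py - field names beyond comment_id
    # and postings are assumptions, not taken from this diff.
    from typing import List, Optional

    from pydantic import BaseModel


    class JobPosting(BaseModel):
        comment_id: str  # shared across roles extracted from one comment
        job_title: str
        company: Optional[str] = None
        location: Optional[str] = None


    class JobPostings(BaseModel):
        postings: List[JobPosting]  # extract() returns .postings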
@@ -0,0 +1,45 @@

    import os
    from typing import List

    import pandas as pd

    from lib.csv_to_markdown import Csv2Markdown


    class DataFile:
        def __init__(self, write_dir: str, read_dir: str):
            self.write_dir = write_dir
            self.read_dir = read_dir

            os.makedirs(self.write_dir, exist_ok=True)

        def read_df(self, file_name: str) -> pd.DataFrame:
            file_path = f"{self.read_dir}/{file_name}"
            if os.path.exists(file_path):
                return pd.read_csv(file_path, index_col=0)
            raise FileNotFoundError(f"File not found: {file_path}")

        def write_df(self, df: pd.DataFrame, file_name: str, partial: bool = False):
            if partial:
                os.makedirs(f"{self.write_dir}/batch", exist_ok=True)
                df.to_csv(f"{self.write_dir}/batch/{file_name}")
            else:
                df.to_csv(f"{self.write_dir}/{file_name}")

        def write_md_from_csv(self, csv_file_name: str, md_file_name: str):
            csv_file = f"{self.read_dir}/{csv_file_name.replace('.csv', '')}.csv"
            md_file = f"{self.write_dir}/{md_file_name.replace('.md', '')}.md"
            md = Csv2Markdown(filepath=csv_file)
            md.save_table(md_file)

        def join_partial_df(self, indices: List[int], file_name: str) -> pd.DataFrame:
            # Concatenate the per-batch CSVs back into a single frame
            batch_csvs = []
            for index in indices:
                batch_df = self.read_df(file_name=f"batch/{index}.csv")
                batch_csvs.append(batch_df)
            df = pd.concat(batch_csvs)
            df = df.reset_index(drop=True)
            self.write_df(df=df, file_name=file_name)
            return df
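A short usage sketch of DataFile as defined above; the directory and file names are illustrative:

    import pandas as pd

    # Hypothetical usage; the "data" directory and file names are illustrative.
    files = DataFile(write_dir="data", read_dir="data")

    files.write_df(pd.DataFrame({"a": [1]}), file_name="0.csv", partial=True)
    files.write_df(pd.DataFrame({"a": [2]}), file_name="1.csv", partial=True)

    # Stitch the batch files back together into data/postings.csv
    df = files.join_partial_df(indices=[0, 1], file_name="postings.csv")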
@@ -0,0 +1,62 @@

    import urllib.parse
    from typing import List

    import requests
    from bs4 import BeautifulSoup


    class NewsScrapper:
        def __init__(self, year: int, month: str):
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0'
            }
            self.year = year
            self.month = month

        def _get_page_comments(self, soup, comment_list: List[dict]):
            comments = soup.find_all(class_="comtr")
            for comment_el in comments:
                comment = comment_el.find(class_="commtext")
                # Skip if comment was deleted
                if comment is None:
                    continue
                # We only care about comments with a pipe character
                # because it is in the format outlined by HN's whoishiring
                if "|" in comment.text:
                    comment_list.append({
                        "comment_text": comment.text,
                        "comment_id": comment_el['id']
                    })
            return comment_list

        def _get_next_page(self, soup, comment_list: List[dict]):
            # Follow the "More" link recursively until the last page
            if soup.find(class_='morelink'):
                next_url = soup.find(class_='morelink')['href']
                response = requests.get(f"https://news.ycombinator.com/{next_url}",
                                        headers=self.headers,
                                        timeout=30)
                soup = BeautifulSoup(response.text, 'lxml')
                _comment_list = self._get_page_comments(soup, comment_list)
                return self._get_next_page(soup, _comment_list)
            return comment_list

        def get_hn_hiring_posts(self):
            # Get the first link from Google search results
            text = f":news.ycombinator.com who's hiring {self.month} {self.year}"
            text = urllib.parse.quote_plus(text)
            url = 'https://google.com/search?q=' + text
            response = requests.get(url, headers=self.headers, timeout=30)
            soup = BeautifulSoup(response.text, 'lxml')

            # Scrape the HN thread
            url = soup.find_all(class_='g')[0].find('a')['href']
            response = requests.get(url, headers=self.headers, timeout=30)
            soup = BeautifulSoup(response.text, 'lxml')
            comment_list = []
            comment_list = self._get_page_comments(soup, comment_list)
            # Continue to scrape the remaining pages of the HN thread
            return self._get_next_page(soup, comment_list)
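Putting the two new classes together, a hypothetical end-to-end run might look like the following; the import paths, month/year values, and the 50-comment slice are assumptions, not taken from this diff:

    import os

    # Hypothetical end-to-end run wiring the scraper to the extractor;
    # NewsScrapper and DataExtractorModel are the classes added above.
    scraper = NewsScrapper(year=2024, month="August")
    comments = scraper.get_hn_hiring_posts()

    extractor = DataExtractorModel(
        model_key=os.environ["OPENAI_API_KEY"],
        model=os.environ.get("OPENAI_MODEL", "gpt-4o-mini"),
    )
    postings = extractor.extract(comments[:50])  # keep a single request small
    for posting in postings:
        print(posting)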