Merge pull request #12 from Amber-Williams/develop
Structured data using gpt-4o-mini & refactor
Amber-Williams authored Aug 11, 2024
2 parents 6822442 + 497be1a commit e0bd30f
Showing 14 changed files with 1,546 additions and 1,270 deletions.
2 changes: 1 addition & 1 deletion .env.example
@@ -1,2 +1,2 @@
 OPENAI_API_KEY=
-OPENAI_MODEL=gpt-3.5-turbo-1106
+OPENAI_MODEL=gpt-4o-mini
7 changes: 6 additions & 1 deletion .gitignore
@@ -14,7 +14,6 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
 lib64/
 parts/
 sdist/
@@ -165,3 +164,9 @@ cython_debug/
 
 # local output files
 output/**
+
+# Local TODO file
+TODO
+
+# Local debug files
+.vscode/
182 changes: 0 additions & 182 deletions chat_extractor.py

This file was deleted.

4 changes: 3 additions & 1 deletion config.py
@@ -10,13 +10,15 @@
 
 class Settings(BaseSettings):
     OPENAI_API_KEY: str = os.getenv('OPENAI_API_KEY')
-    OPENAI_MODEL: str = os.getenv('OPENAI_MODEL') or "gpt-3.5-turbo-1106"
+    OPENAI_MODEL: str = os.getenv('OPENAI_MODEL')
     MONTH: str = os.getenv('MONTH') or datetime.now().strftime('%B')
     YEAR: str = os.getenv('YEAR') or datetime.now().strftime('%Y')
     TOKEN_LIMIT: int = os.getenv('TOKEN_LIMIT') or 4096
 
     if not OPENAI_API_KEY:
         raise ValueError("OPENAI_API_KEY not found in .env file")
+    if not OPENAI_MODEL:
+        raise ValueError("OPENAI_MODEL not found in .env file")
     if not MONTH:
         raise ValueError("MONTH not found in .env file")
     if not YEAR:
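With this change, OPENAI_MODEL no longer silently falls back to gpt-3.5-turbo-1106. A minimal usage sketch of what that means for callers, assuming downstream code reads the class attributes directly as the shown validation implies (the checks run at class-definition time, so the import itself fails fast):

# Hypothetical usage sketch -- not part of this diff.
# Importing config raises ValueError when OPENAI_MODEL is missing from the environment/.env.
from config import Settings

print(Settings.OPENAI_MODEL)  # e.g. "gpt-4o-mini", as set in .env.example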
File renamed without changes.
58 changes: 58 additions & 0 deletions lib/data_extractor_model.py
@@ -0,0 +1,58 @@
import json
from typing import List
from httpx import Timeout

from pydantic import ValidationError
from openai import OpenAI
import tiktoken

from models import JobPosting, JobPostings


class DataExtractorModel:
    def __init__(self, model_key: str, model: str):
        self.valid_gpt_models = [
            # Models must support structured data extraction
            "gpt-4o-mini",
            "gpt-4o",
            "gpt-4o-2024-08-06"
        ]

        if isinstance(model, str):
            if model not in self.valid_gpt_models:
                raise ValueError(
                    f"Invalid model. Available GPT models: {', '.join(self.valid_gpt_models)}"
                )
            self.model = model

        self.client = OpenAI(api_key=model_key)
        self.token_encoding = tiktoken.encoding_for_model(self.model)

        self.system_role_prompt = """
You are a data extraction expert that extracts job posting data from the list of job postings you are provided.
Important rules:
- There can be many roles in one posting; in these cases the comment_id will be the same for the related roles.
- If a job title contains 'and' or '&', make sure you split it into more than one job posting."""

    def get_token_estimate(self, content: str) -> int:
        # encode() returns a list of token ids; the estimate is its length
        return len(self.token_encoding.encode(content))

    def extract(self, content) -> List[JobPosting]:
        try:
            content_json = json.dumps(content)
            completion = self.client.beta.chat.completions.parse(
                model=self.model,
                response_format=JobPostings,
                messages=[
                    {"role": "system", "content": self.system_role_prompt},
                    {"role": "user", "content": content_json},
                ],
                temperature=0.0,
                timeout=Timeout(60),
            )
            return completion.choices[0].message.parsed.postings
        except ValidationError as e:
            print(f"Validation error: {e}")
            raise e
        except Exception as err:
            print(f"Unexpected {err=}, {type(err)=}")
            raise err
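data_extractor_model.py imports JobPosting and JobPostings from models, which this page does not render. For the structured-output parse call above to work, they must be Pydantic models, and the result object must expose a postings list. A minimal sketch of what they could look like; the field names beyond comment_id and postings are illustrative assumptions, not the repository's actual schema:

# models.py -- hypothetical sketch; fields other than comment_id/postings are assumptions
from typing import List, Optional

from pydantic import BaseModel


class JobPosting(BaseModel):
    comment_id: str  # HN comment id, shared by roles split out of the same posting
    company: str
    job_title: str
    location: Optional[str] = None
    remote: Optional[bool] = None
    url: Optional[str] = None


class JobPostings(BaseModel):
    postings: List[JobPosting]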
45 changes: 45 additions & 0 deletions lib/data_file.py
@@ -0,0 +1,45 @@
import os
from typing import List

import pandas as pd

from lib.csv_to_markdown import Csv2Markdown


class DataFile:
    def __init__(self, write_dir: str, read_dir: str):
        self.write_dir = write_dir
        self.read_dir = read_dir

        if not os.path.exists(self.write_dir):
            os.makedirs(self.write_dir, exist_ok=True)

    def read_df(self, file_name: str):
        file_path = f"{self.read_dir}/{file_name}"
        if os.path.exists(file_path):
            return pd.read_csv(file_path, index_col=0)
        raise FileNotFoundError(f"File not found: {file_path}")

    def write_df(self, df: pd.DataFrame, file_name: str, partial: bool = False):
        if partial:
            if not os.path.exists(f"{self.write_dir}/batch"):
                os.makedirs(f"{self.write_dir}/batch", exist_ok=True)
            df.to_csv(f"{self.write_dir}/batch/{file_name}")
        else:
            df.to_csv(f"{self.write_dir}/{file_name}")

    def write_md_from_csv(self, csv_file_name: str, md_file_name: str):
        csv_file = f"{self.read_dir}/{csv_file_name.replace('.csv', '')}.csv"
        md_file = f"{self.write_dir}/{md_file_name.replace('.md', '')}.md"
        md = Csv2Markdown(filepath=csv_file)
        md.save_table(md_file)

    def join_partial_df(self, indices: List[int], file_name: str):
        batch_csvs = []
        for index in indices:
            batch_df = self.read_df(file_name=f"batch/{index}.csv")
            batch_csvs.append(batch_df)
        df = pd.concat(batch_csvs)
        df = df.reset_index(drop=True)
        self.write_df(df=df, file_name=file_name)
        return df
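A brief usage sketch of the partial-write / join flow DataFile supports; the directory, file names, and sample data below are illustrative, not taken from the repository:

# Hypothetical usage sketch of DataFile's batch flow
import pandas as pd

from lib.data_file import DataFile

files = DataFile(write_dir="output/august_2024", read_dir="output/august_2024")

batch_frames = [
    pd.DataFrame([{"comment_id": "1", "job_title": "Backend Engineer"}]),
    pd.DataFrame([{"comment_id": "2", "job_title": "Data Scientist"}]),
]

# Each batch lands in output/august_2024/batch/<index>.csv ...
for index, batch_df in enumerate(batch_frames):
    files.write_df(df=batch_df, file_name=f"{index}.csv", partial=True)

# ...then the batches are stitched back into one CSV and rendered as a markdown table.
df = files.join_partial_df(indices=[0, 1], file_name="postings.csv")
files.write_md_from_csv(csv_file_name="postings.csv", md_file_name="postings.md")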
62 changes: 62 additions & 0 deletions lib/news_scrapper.py
@@ -0,0 +1,62 @@
import urllib.parse
import requests
from typing import List

from bs4 import BeautifulSoup


class NewsScrapper:
    def __init__(self, year: int, month: str):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0'
        }
        self.year = year
        self.month = month

    def _get_page_comments(self, soup, comment_list: List[dict]):
        comments = soup.find_all(class_="comtr")
        for comment_el in comments:
            comment = comment_el.find(class_="commtext")
            # Skip if comment was deleted
            if comment is None:
                continue
            # We only care about comments with a pipe character
            # because it is in the format outlined by HN's whoishiring
            if "|" in comment.text:
                comment_list.append({
                    "comment_text": comment.text,
                    "comment_id": comment_el['id']
                })
        return comment_list

    def _get_next_page(self, soup, comment_list: List[dict]):
        if soup.find(class_='morelink'):
            next_url = soup.find(class_='morelink')
            next_url = next_url['href']
            response = requests.get(f"https://news.ycombinator.com/{next_url}",
                                    headers=self.headers,
                                    timeout=30
                                    )
            soup = BeautifulSoup(response.text, 'lxml')
            _comment_list = self._get_page_comments(soup, comment_list)
            return self._get_next_page(soup, _comment_list)
        else:
            return comment_list

    def get_hn_hiring_posts(self):
        # Get the first link from Google search results
        text = f":news.ycombinator.com who's hiring {self.month} {self.year}"
        text = urllib.parse.quote_plus(text)
        url = 'https://google.com/search?q=' + text
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'lxml')

        # Scrape the HN thread
        url = soup.find_all(class_='g')[0]
        url = url.find('a')
        url = url['href']
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'lxml')
        comment_list = []
        comment_list = self._get_page_comments(soup, comment_list)
        # Continue to scrape the remaining pages of the HN thread
        return self._get_next_page(soup, comment_list)
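The remaining files in this diff, including the refactored entry point, are not rendered on this page. For orientation only, a hedged sketch of how the pieces shown above could be wired together; this is an illustration under stated assumptions, not the repository's actual main.py:

# Hypothetical end-to-end sketch: scrape the "Who is hiring?" thread,
# extract structured postings with gpt-4o-mini, and save CSV + markdown output.
import pandas as pd

from config import Settings
from lib.data_extractor_model import DataExtractorModel
from lib.data_file import DataFile
from lib.news_scrapper import NewsScrapper

scrapper = NewsScrapper(year=Settings.YEAR, month=Settings.MONTH)
comments = scrapper.get_hn_hiring_posts()

extractor = DataExtractorModel(model_key=Settings.OPENAI_API_KEY, model=Settings.OPENAI_MODEL)
files = DataFile(write_dir="output", read_dir="output")

postings = []
for comment in comments:
    # Extract one comment at a time here for simplicity; the real pipeline
    # presumably batches comments using get_token_estimate and TOKEN_LIMIT.
    if extractor.get_token_estimate(comment["comment_text"]) > int(Settings.TOKEN_LIMIT):
        continue
    postings.extend(extractor.extract([comment]))

# model_dump() assumes pydantic v2; a v1 schema would use .dict() instead.
df = pd.DataFrame([posting.model_dump() for posting in postings])
files.write_df(df=df, file_name="postings.csv")
files.write_md_from_csv(csv_file_name="postings.csv", md_file_name="postings.md")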
