3k lobstr gnews daily from gdoc
dcolinmorgan committed Mar 26, 2024
1 parent 2613afb commit 7027d24
Showing 4 changed files with 172 additions and 37 deletions.
165 changes: 146 additions & 19 deletions DOTS/dots_feat.ipynb
@@ -1463,7 +1463,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -1475,21 +1475,11 @@
},
{
"cell_type": "code",
"execution_count": 51,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"device(type='cpu')"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"import torch\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
]
},
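A brief note on the device-selection cell above: the bare assignment produces no output (hence the now-empty `outputs` list), and whatever model the notebook loads still has to be moved to the same device so that inputs created with `.to(device)` match its parameters. A minimal sketch, with the checkpoint name purely an assumption (the notebook's actual model is loaded elsewhere):

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hypothetical checkpoint name; substitute the model the notebook actually uses.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").to(device)
```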
@@ -1558,7 +1548,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"featurizing articles: 99%|█████████▊| 2646/2682 [1:26:53<01:06, 1.84s/it]"
"featurizing articles: 100%|██████████| 2682/2682 [1:28:12<00:00, 1.97s/it]\n"
]
}
],
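The stderr line above is a tqdm progress bar; the loop that produces it is not shown in this hunk. A sketch of what such a loop might look like, assuming `articles` holds the 2,682 scraped stories and reusing the `featurize_stories(..., top_k=3, max_len=512)` call that appears elsewhere in this diff; `rank_articles` is the list a later cell wraps in a DataFrame:

```python
from tqdm import tqdm

rank_articles = []
for article in tqdm(articles, desc="featurizing articles"):
    try:
        phrases = featurize_stories(str(article), top_k=3, max_len=512)
    except Exception:
        phrases = None  # keep a placeholder row when featurization fails
    rank_articles.append([article, phrases])
```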
@@ -1577,22 +1567,159 @@
},
{
"cell_type": "code",
"execution_count": 52,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>published_at \\\n",
"0 2024-02-...</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>published_at \\\n",
"0 2024-02-...</td>\n",
" <td>[huge stresses, disasters, aspects]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>published_at \\\n",
"0 2024-02-...</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>published_at \\\n",
"0 2024-02-...</td>\n",
" <td>[surveillance, candidates, surveillance towers]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>published_at \\\n",
"0 2024-02-...</td>\n",
" <td>[judgement, mercedes, catastrophe]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2677</th>\n",
" <td>published_at \\\n",
"0 2024-02-...</td>\n",
" <td>[temperatures, widespread rains, meteorologist]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2678</th>\n",
" <td>published_at \\\n",
"0 2024-02-...</td>\n",
" <td>[afternoon, thunderstorms, severe thunderstorms]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2679</th>\n",
" <td>published_at \\\n",
"0 2024-02-...</td>\n",
" <td>[snowfall, morning, winter weather]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2680</th>\n",
" <td>published_at \\\n",
"0 2024-02-...</td>\n",
" <td>[meteorologist, heavy rain, gusty winds]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2681</th>\n",
" <td>published_at \\\n",
"0 2024-02-...</td>\n",
" <td>[breezy conditions, dry weather, stretch]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2682 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
"['cloudy skies', 'dry weather', 'breezy conditions']"
" 0 \\\n",
"0 published_at \\\n",
"0 2024-02-... \n",
"1 published_at \\\n",
"0 2024-02-... \n",
"2 published_at \\\n",
"0 2024-02-... \n",
"3 published_at \\\n",
"0 2024-02-... \n",
"4 published_at \\\n",
"0 2024-02-... \n",
"... ... \n",
"2677 published_at \\\n",
"0 2024-02-... \n",
"2678 published_at \\\n",
"0 2024-02-... \n",
"2679 published_at \\\n",
"0 2024-02-... \n",
"2680 published_at \\\n",
"0 2024-02-... \n",
"2681 published_at \\\n",
"0 2024-02-... \n",
"\n",
" 1 \n",
"0 None \n",
"1 [huge stresses, disasters, aspects] \n",
"2 None \n",
"3 [surveillance, candidates, surveillance towers] \n",
"4 [judgement, mercedes, catastrophe] \n",
"... ... \n",
"2677 [temperatures, widespread rains, meteorologist] \n",
"2678 [afternoon, thunderstorms, severe thunderstorms] \n",
"2679 [snowfall, morning, winter weather] \n",
"2680 [meteorologist, heavy rain, gusty winds] \n",
"2681 [breezy conditions, dry weather, stretch] \n",
"\n",
"[2682 rows x 2 columns]"
]
},
"execution_count": 52,
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"featurize_stories(str(i), top_k = 3, max_len=512)"
"pd.DataFrame(rank_articles)"
]
},
{
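Since `pd.DataFrame(rank_articles)` above produces unnamed columns (0 holds the raw article record, 1 the extracted phrases or `None`), one optional, purely illustrative tidy-up step; the column names here are assumptions:

```python
import pandas as pd

df = pd.DataFrame(rank_articles, columns=["article", "top_phrases"])
df = df.dropna(subset=["top_phrases"])  # drop rows where featurization returned None
```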
13 changes: 10 additions & 3 deletions DOTS/feat.py
@@ -63,6 +63,7 @@ def chunk_text(text, max_len):

def featurize_stories(text, top_k, max_len):
# Extract candidate words/phrases
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])
all_candidates = count.get_feature_names_out()
doc = nlp(text)
@@ -77,17 +78,23 @@ def featurize_stories(text, top_k, max_len):
all_nouns = nouns.union(noun_phrases)
candidates = list(filter(lambda candidate: candidate in all_nouns, all_candidates))
candidate_tokens = tokenizer(candidates, padding=True, return_tensors="pt")
# if device == 'cuda':

candidate_tokens = {k: v.to(device) for k, v in (candidate_tokens).items()}
candidate_embeddings = model(**candidate_tokens)["pooler_output"]
candidate_embeddings = candidate_embeddings.detach() # .to_numpy
if device == 'cuda':
candidate_embeddings = candidate_embeddings.detach().to_numpy
else:
candidate_embeddings = candidate_embeddings.detach()
chunks = chunk_text(text, max_len) # use this to chunk better and use less padding thus less memory but also less affect from averging
for chunk in chunks:
text_tokens = tokenizer(chunk, padding=True, return_tensors="pt")
# if device == 'cuda':
text_tokens = {k: v.to(device) for k, v in (text_tokens).items()}
text_embedding = model(**text_tokens)["pooler_output"]
text_embedding = text_embedding.detach()#.to_numpy()
if device == 'cuda':
text_embedding = text_embedding.detach().to_numpy()
else:
text_embedding = text_embedding.detach()
embeddings.append(text_embedding)
max_emb_shape = max(embedding.shape[0] for embedding in embeddings)
padded_embeddings = [np.pad(embedding.cpu(), ((0, max_emb_shape - embedding.shape[0]), (0, 0))) for embedding in embeddings]
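A caveat on the new branches above: `device` is a `torch.device`, so comparing it to the string `'cuda'` may never evaluate to true depending on the PyTorch version, and tensors expose a `.numpy()` method (after `.cpu()` for CUDA tensors) rather than a `.to_numpy` attribute, so the `if` branch would likely raise if it were ever taken. A minimal, device-agnostic sketch under those assumptions; if the embeddings are instead kept as tensors, the later `embedding.cpu()` call in the padding step already handles the device move:

```python
import numpy as np
import torch

def to_array(t: torch.Tensor) -> np.ndarray:
    """Detach a tensor and return a NumPy copy, regardless of the device it lives on."""
    return t.detach().cpu().numpy()

# e.g. inside featurize_stories, instead of branching on `device == 'cuda'`:
# candidate_embeddings = to_array(model(**candidate_tokens)["pooler_output"])
# text_embedding = to_array(model(**text_tokens)["pooler_output"])
```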
29 changes: 15 additions & 14 deletions DOTS/scrape.py
@@ -108,18 +108,19 @@ def get_npr_news(p):
return full_stories


def pull_lobstr(page=1, limit=100):
values = {
"cluster": "65b9eea6e1cc6bb9f0cd2a47751a186f"
}
headers = {
'Content-Type': 'application/json',
'Authorization': str({lobstr_key})
}
url = f'https://api.lobstr.io/v1/runs?page={page}&limit={limit}'
responseA = requests.get(url, headers=headers, params=values)

url = f'https://api.lobstr.io/v1/runs/{responseA.json()['run_id']}/download'
response = requests.get(url, headers=headers)
# def pull_lobstr(page=1, limit=100):
# values = {
# "cluster": "65b9eea6e1cc6bb9f0cd2a47751a186f"
# }
# headers = {
# 'Content-Type': 'application/json',
# 'Authorization': str({lobstr_key})
# }
# url = f'https://api.lobstr.io/v1/runs?page={page}&limit={limit}'
# responseA = requests.get(url, headers=headers, params=values)


# url = f'https://api.lobstr.io/v1/runs/{responseA.json()['run_id']}/download'
# response = requests.get(url, headers=headers)

return response.json()
# return response.json()
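Two small notes on the code being commented out here, in case `pull_lobstr` is revived later: the download URL f-string nests single quotes (`responseA.json()['run_id']`), which is a SyntaxError on Python versions before 3.12, and `str({lobstr_key})` formats the Authorization header as a Python set literal rather than the raw key. A hedged sketch of the same two calls; the exact auth scheme and response field names expected by the Lobstr API are assumptions taken from the original lines:

```python
import requests

def pull_lobstr(lobstr_key, page=1, limit=100):
    headers = {
        "Content-Type": "application/json",
        "Authorization": lobstr_key,  # assumption: raw token; adjust if the API expects e.g. "Token <key>"
    }
    params = {"cluster": "65b9eea6e1cc6bb9f0cd2a47751a186f"}

    runs = requests.get(
        f"https://api.lobstr.io/v1/runs?page={page}&limit={limit}",
        headers=headers, params=params,
    )
    run_id = runs.json()["run_id"]  # assumption: field name taken from the original f-string

    download = requests.get(
        f"https://api.lobstr.io/v1/runs/{run_id}/download",
        headers=headers,
    )
    return download.json()
```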
2 changes: 1 addition & 1 deletion DOTS/test/test_dots_feat.py
@@ -79,7 +79,7 @@ def test_gnews_featurize():
def get_lobstr_data():
return pull_lobstr_gdoc()

def test_lobstry_data(get_lobstr_data):
def test_lobstr_data(get_lobstr_data):
articles = get_lobstr_data
assert len(articles) > 2500
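Minor note on the test file: `test_lobstr_data` takes `get_lobstr_data` as an argument, which pytest only injects if `get_lobstr_data` is registered as a fixture (the decorator may simply sit above the lines shown in this hunk). A minimal sketch of the pattern, assuming that is the intent; the import path for `pull_lobstr_gdoc` is an assumption:

```python
import pytest
from DOTS.scrape import pull_lobstr_gdoc  # assumption: where the helper lives

@pytest.fixture
def get_lobstr_data():
    return pull_lobstr_gdoc()

def test_lobstr_data(get_lobstr_data):
    articles = get_lobstr_data
    assert len(articles) > 2500
```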

