-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathngd.py
110 lines (95 loc) · 3.36 KB
/
ngd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import math, sys, time, random, collections
import numpy as np
import pandas as pd
"""
A Python script to calculate the Normalized Google Distance.
The Normalized Google Distance (NGD) is a semantic similarity measure,
calculated based on the number of hits returned by Google for a set of
keywords. If keywords have many pages in common relative to their respective,
independent frequencies, then these keywords are thought to be semantically
similar.
If two search terms w1 and w2 never occur together on the same web
page, but do occur separately, the NGD between them is infinite.
If both terms always (and only) occur together, their NGD is zero.
"""
def NGD(w1, w2):
    """
    Return the Normalized Google Distance between two queries.

    The distance is computed from base-2 logarithms of hit counts as

        (max(log f(w1), log f(w2)) - log f(w1 w2))
        / (log N - min(log f(w1), log f(w2)))

    where f(w1 w2) is the hit count for the combined query "w1 w2" and N is
    a fixed estimate of the total number of indexed pages.

    Params:
        w1 (str): word 1
        w2 (str): word 2
    Returns:
        NGD (float): 0 when both words are equal (no lookup is made),
        ``math.inf`` when each word has hits on its own but the combined
        query has none, otherwise the computed distance.
    Raises:
        ValueError: if either word on its own has no hits, since the
        distance is undefined in that case.
    """
    if w1 == w2:
        return 0

    # Number of results for "the", proxy for total pages
    log_total = math.log2(25270000000.0)

    # Look the single terms up one at a time so that a term with no hits
    # fails immediately instead of triggering further (slow) lookups.
    single_logs = []
    for word in (w1, w2):
        hits = number_of_results(word)
        if hits <= 0:
            raise ValueError(
                "No results for {!r}; NGD is undefined.".format(word)
            )
        single_logs.append(math.log2(hits))

    joint_hits = number_of_results(w1 + " " + w2)
    if joint_hits <= 0:
        # The terms occur separately but never together: infinite distance.
        return math.inf

    log_joint = math.log2(joint_hits)
    distance = (max(single_logs) - log_joint) / (log_total - min(single_logs))
    return distance
def calculate_NGD(w1, w2, n_retries=10):
    """
    Attempt to calculate NGD, retrying on failure.

    ``NGD`` is called up to `n_retries` times. (Sometimes Google throws
    captcha pages, which surface here as exceptions.) Each exception is
    printed and the call is attempted again. If all attempts fail, or
    `n_retries` is not positive, NaN is returned for this pairwise
    comparison.

    Params:
        w1 (str): word 1
        w2 (str): word 2
        n_retries (int): Maximum number of attempts before returning NaN
    Returns:
        if successful:
            returns NGD
        if not successful:
            returns np.nan
    """
    for _ in range(n_retries):
        try:
            return NGD(w1, w2)
        except Exception as e:
            # Deliberately broad: this is a best-effort lookup, so any
            # failure simply triggers another attempt.
            print("Trying again...")
            print(e)

    print("Sorry. We tried and failed. Returning NaN.")
    # Use np.nan: the capitalised alias np.NaN was removed in NumPy 2.0.
    return np.nan
def pairwise_NGD(element_list, retries=10):
    """
    Compute pairwise NGD for a list of terms.

    Params:
        element_list (iterable): terms to compare with one another
        retries (int): forwarded to `calculate_NGD` as its retry count
    Returns:
        collections.defaultdict: nested mapping where result[a][b] holds the
        distance for the pair (a, b); the value for (b, a) is reused when it
        has already been computed.
    """
    table = collections.defaultdict(dict)  # nested dict: table[a][b]
    for first in element_list:
        sleep(5, 10)
        for second in element_list:
            print(first, second)
            # Indexing the defaultdict creates the row for `second` if absent.
            mirror_row = table[second]
            if first in mirror_row:
                # NGD(second, first) is already known, so reuse it.
                table[first][second] = mirror_row[first]
            else:
                table[first][second] = calculate_NGD(first, second, retries)
    return table
def pairwise_NGD_to_df(distances):
    """
    Convert a nested mapping of pairwise NGD values into a dataframe.

    Params:
        distances (dict): mapping term -> {term: NGD}; every inner mapping
            must hold an entry for every outer key
    Returns:
        pandas.DataFrame with one column and one row label per outer key.
        Column `i` lists distances[i][j] for each key j, in key order.
    """
    columns = {
        outer: [distances[outer][inner] for inner in distances]
        for outer in distances
    }
    frame = pd.DataFrame(columns)
    frame.index = distances  # row labels are the outer keys
    return frame
def _parse_result_count(stats_text):
    """Return the first whole-number token in a result-stats string."""
    # Drop thousands separators, then take the first purely numeric token so
    # both "About 1,234 results" and "5 results" style strings are handled.
    for token in stats_text.replace(",", "").split():
        if token.isdecimal():
            return int(token)
    raise ValueError(
        "No result count found in {!r}".format(stats_text)
    )


def number_of_results(text):
    """
    Return the number of Google results for a given query.

    Sleeps for a random 5-10 seconds before every request.

    Params:
        text (str): the search query
    Returns:
        int: the result count read from the page's ``result-stats`` element
    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
        ValueError: if the page has no ``result-stats`` element or no count
            can be read from it.
    """
    headers = {'User-Agent': UserAgent().firefox}
    sleep(5, 10)
    # Pass the query via `params` so it is URL-encoded, and set a timeout so
    # a stalled connection cannot hang forever.
    r = requests.get(
        "https://www.google.com/search",
        params={"q": text},
        headers=headers,
        timeout=30,
    )
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")  # Parse the text response
    res = soup.find('div', {'id': 'result-stats'})  # Find result string
    if res is None:
        raise ValueError(
            "No result-stats element in the response for {!r}".format(text)
        )
    return _parse_result_count(res.text)
def sleep(alpha, beta):
    """Pause for a random number of seconds drawn uniformly between alpha and beta."""
    # A fresh Random instance is used for every call.
    delay = random.Random().uniform(alpha, beta)
    time.sleep(delay)
if __name__ == "__main__":
    # Running the file directly only prints this banner.
    print("This is a script for calculating NGD.")