forked from youn0808/BlockchainDataAnalytics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_features.py
146 lines (121 loc) · 4.86 KB
/
get_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import calendar
import gc
import os
from csv import writer

import numpy as np
# Year being processed; the per-month day counts below are derived from it.
year = 2015
# Root directory that holds one "month_<n>/" sub-directory per month.
intermediate_dir = "data/intermediate_data/"
# Number of days in each month of `year` (index 0 is January). Taken from the
# calendar instead of a hard-coded table so February is also right when `year`
# is a leap year.
month_days = [calendar.monthrange(year, m)[1] for m in range(1, 13)]
# Column order of every row written to the output CSV.
featureNames = [
    "address_hash",
    "price",
    "no_transaction",
    "no_neighbor",
    "avg_price_neighbors",
    "month",
    "day",
    "is_bad_address",
]
# Per-month data directories. They are placeholders here; the script entry
# point assigns them for each month before getFeaturesOfDay() reads them.
month_dir = None
bad_times_dir = None
bad_prices_dir = None
tx_times_dir = None
addr_counts_dir = None
avg_prices_dir = None
addr_prices_dir = None
addr_tx_index_dir = None
addr_hashes_dir = None
def _load_day_array(directory, prefix, day):
    """Load the array stored in ``<directory>/<prefix>_<day>.npy``."""
    return np.load(os.path.join(directory, prefix + "_" + str(day) + ".npy"))


def getFeaturesOfDay(month, day, isBadAddrAvailable, csvWriter, *, max_good_addresses=1000):
    """Extract the address features of one day and write them as CSV rows.

    The per-day arrays are read from the module-level ``*_dir`` directories,
    which must have been set for the month before this function is called.
    Each written row follows the ``featureNames`` column order: address hash,
    address price, number of entries in the day's ``tx_times`` array, the
    ``addr_counts`` value of the address's transaction minus one, the
    ``avg_prices`` value of that transaction, month, day and the bad flag.

    Args:
        month: Month number, copied into every row.
        day: Day of the month; selects the ``*_<day>.npy`` files to load.
        isBadAddrAvailable: When true, ``bad_prices_<day>.npy`` is loaded and
            every address whose price equals one of those prices is written
            with the bad flag set to ``True``.
        csvWriter: Object with a ``writerow(sequence)`` method.
        max_good_addresses: Upper bound on the number of rows written with the
            bad flag set to ``False``. Addresses already written as bad and
            addresses whose price is 0 are skipped and do not count.

    Raises:
        OSError: If one of the required ``.npy`` files cannot be opened
            (raised by ``np.load``).
    """
    tx_times = _load_day_array(tx_times_dir, "tx_times", day)
    addr_counts = _load_day_array(addr_counts_dir, "addr_counts", day)
    avg_prices = _load_day_array(avg_prices_dir, "avg_prices", day)
    addr_prices = _load_day_array(addr_prices_dir, "addr_prices", day)
    addr_tx_index = _load_day_array(addr_tx_index_dir, "addr_tx_index", day)
    addr_hashes = _load_day_array(addr_hashes_dir, "addr_hashes", day)

    tx_count = len(tx_times)
    # Tracks which addresses already have a row so none is written twice.
    added_addrs = [False] * len(addr_prices)

    def write_row(addr_pos, is_bad):
        """Write the feature row of the address at ``addr_pos`` and mark it."""
        tx_pos = addr_tx_index[addr_pos]
        csvWriter.writerow([
            addr_hashes[addr_pos],
            addr_prices[addr_pos],
            tx_count,
            addr_counts[tx_pos] - 1,
            avg_prices[tx_pos],
            month,
            day,
            is_bad,
        ])
        added_addrs[addr_pos] = True

    if isBadAddrAvailable:
        # NOTE(review): matching is by price only. The original timestamp
        # comparison (tx_times[index] == bad_times[i]) was commented out, so
        # the bad_times file is no longer loaded here.
        bad_prices = _load_day_array(bad_prices_dir, "bad_prices", day)
        for bad_price in bad_prices:
            # Scale the darknet price by 10**8 so it is comparable with
            # addr_prices (presumably BTC -> satoshi; confirm with the
            # producer of the intermediate data).
            price_in_darknet = int(round(float(bad_price) * 10 ** 8))
            if price_in_darknet == 0:
                continue
            for addr_pos in np.where(addr_prices == price_in_darknet)[0]:
                if not added_addrs[addr_pos]:
                    write_row(addr_pos, True)

    # Add good addresses. The cap is checked before writing so that exactly
    # `max_good_addresses` rows are emitted at most; checking
    # `good_count > 1000` after the increment let one extra row through.
    good_count = 0
    for addr_pos, price in enumerate(addr_prices):
        if good_count >= max_good_addresses:
            break
        if int(price) == 0 or added_addrs[addr_pos]:
            continue
        write_row(addr_pos, False)
        good_count += 1
# Script entry point: walks every day of every month, points the module-level
# directory variables at that month's data and appends the day's feature rows
# to data/output_features.csv.
if __name__ == '__main__':
    print("Extracting features...")
    output_path = "data/output_features.csv"
    # Append mode keeps rows from earlier runs. The context manager makes sure
    # the file is flushed and closed even when a day fails part-way through.
    with open(output_path, "a", encoding='UTF8', newline='') as f_csv:
        csvWriter = writer(f_csv)
        # Write the header only into an empty file; otherwise a second run
        # would drop a header line in the middle of the existing rows.
        if os.path.getsize(output_path) == 0:
            csvWriter.writerow(featureNames)
        for i in range(12):
            month = i + 1
            # initializing data directories for the given month
            month_dir = intermediate_dir + "month_" + str(month) + "/"
            # data directories of darknet data for the given month
            bad_times_dir = month_dir + "bad_times/"
            bad_prices_dir = month_dir + "bad_prices/"
            # data directories of network data for the given month
            tx_times_dir = month_dir + "tx_times/"
            addr_counts_dir = month_dir + "addr_counts/"
            avg_prices_dir = month_dir + "avg_prices/"
            addr_prices_dir = month_dir + "addr_prices/"
            addr_tx_index_dir = month_dir + "addr_tx_index/"
            addr_hashes_dir = month_dir + "addr_hashes/"
            # file names of the darknet data present for the current month
            fnames_bad_times = set(os.listdir(bad_times_dir))
            fnames_bad_prices = set(os.listdir(bad_prices_dir))
            for day in range(1, month_days[i] + 1):
                has_bad_times = "bad_times_" + str(day) + ".npy" in fnames_bad_times
                has_bad_prices = "bad_prices_" + str(day) + ".npy" in fnames_bad_prices
                # Bad-address matching needs the day's price file, so the day
                # only counts as having darknet data when both files exist;
                # checking bad_times alone crashed on a missing price file.
                isBadAddrAvailable = has_bad_times and has_bad_prices
                if has_bad_times != has_bad_prices:
                    print("Warning: incomplete darknet data for month "
                          + str(month) + ", day " + str(day)
                          + "; treating the day as having no bad addresses.")
                # extract the features of this day into the CSV file
                getFeaturesOfDay(month, day, isBadAddrAvailable, csvWriter)
                # release the day's arrays before the next day is loaded
                gc.collect()
    print("Feature extraction finished.")