-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path[Extract]opcode_process.py
74 lines (62 loc) · 1.98 KB
/
[Extract]opcode_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os, sys
import shlex
import subprocess
import time
from collections import Counter
from collections import defaultdict

import numpy as np
import pandas as pd

from opcode_list import *
# Input locations: a directory of binaries to disassemble and the spreadsheet
# that holds their labels (the spreadsheet is read further down the script).
data_dir = "./TrainSet"
answer_file = "TrainSet_answer.xlsx"
file_list = os.listdir(data_dir)

# Shell stages applied to the `objdump -d` listing of one file. They keep the
# lines matching '[0-9a-f]:' (minus any line containing 'file'), take the text
# after the first colon, keep its first six space-separated fields, normalise
# the whitespace, join all lines into one and wrap the result in double quotes.
# NOTE(review): the closing quote appears to end up glued to the final token,
# which then fails the two-character length check below and is dropped --
# confirm that losing the last token of every file is intended.
byte_pipeline = (
    "|grep '[0-9a-f]:'|grep -v 'file'|cut -f2 -d:|cut -f1-6 -d' '"
    "|tr -s ' '|tr '\t' ' '|sed 's/ $//g'|paste -d '' -s "
    "|sed 's/^/\"/'|sed 's/$/\"/g'"
)

# Set copy of the imported whitelist: membership is tested once per distinct
# token per file, and a list would make each of those tests a linear scan.
known_opcodes = set(opcodes_list)

# all_data: file name -> {token: occurrences in that file}
# idf_data: token -> number of files in which the token occurs at least once
all_data = {}
idf_data = defaultdict(int)

print("Opcode processing...")
start = time.time()
for count, i in enumerate(file_list):
    if count % 100 == 0:
        print(count)

    # The path is spliced into a shell command line, so it has to be quoted:
    # directory entries may contain spaces or shell metacharacters.
    target = shlex.quote(data_dir + "/" + i)
    command = "objdump -d " + target + byte_pipeline
    return_v = subprocess.check_output(command, shell=True).decode("utf-8")
    result = return_v.split()

    # Count the tokens once and keep only two-character tokens that are on
    # the whitelist; everything else (quotes, fused tokens, unknown values)
    # is discarded.
    opcodes = {
        token: occurrences
        for token, occurrences in Counter(result).items()
        if len(token) == 2 and token in known_opcodes
    }

    # Each surviving token counts once per file towards the document frequency.
    for token in opcodes:
        idf_data[token] += 1

    all_data[i] = opcodes
end = time.time()
print("Processing time: {}s".format(end-start))
start = time.time()
print("TF-IDF Processing")
# Weighting pass: every per-file count is divided by the number of files that
# contain the same token (idf_data) and rounded to seven decimal places.
# The per-file dictionaries are updated in place.
for index, file_weights in enumerate(all_data.values()):
    print(index)
    scaled = {
        token: round(raw_count / idf_data[token], 7)
        for token, raw_count in file_weights.items()
    }
    file_weights.update(scaled)
end = time.time()
elapsed = end - start
print("Processing time: {}s".format(elapsed))
# Column layout of the feature table: "filename" first, then one column per
# whitelisted opcode. A new list is built instead of inserting into the
# imported opcodes_list, so running this section again cannot add the
# "filename" column a second time.
columns = ["filename"] + list(opcodes_list)

# Collect one row per file and build the frame in a single call.
# DataFrame.append was removed in pandas 2.0, and appending row by row copied
# the whole frame on every iteration.
rows = []
for index, i in enumerate(all_data):
    print(index)
    # Copy the weights into a fresh dict so all_data itself stays free of the
    # non-numeric "filename" entry.
    rows.append({"filename": i, **all_data[i]})

# Opcodes that a file does not contain are left as NaN in that file's row.
df_data = pd.DataFrame(rows, columns=columns)
# Load the label sheet. It is joined on 'filename' below, so the sheet must
# carry a column with exactly that header.
label_csv = pd.read_excel(answer_file)

# Attach the labels to the feature rows. merge() defaults to an inner join,
# so only file names present in both tables are written out.
temp = df_data.merge(label_csv, on='filename')
temp.to_csv("train_data_tfidf.csv", index=False)