# [Extract] opcode_process.py

import os
import shlex
import subprocess
import sys
import time
from collections import Counter
from collections import defaultdict

import numpy as np
import pandas as pd

from opcode_list import *

# Directory whose entries are disassembled below, and the spreadsheet that is
# later merged with the extracted features on the "filename" column.
data_dir = "./TrainSet"
answer_file = "TrainSet_answer.xlsx"

# os.listdir() returns entries in arbitrary order; sorting makes the
# processing order (and therefore the row order of the output) reproducible.
file_list = sorted(os.listdir(data_dir))

# Per-file token counter; rebound to a fresh mapping for every processed file.
opcodes = defaultdict(int)
# filename -> {token: count}. Each unknown filename gets its OWN empty counter;
# the previous factory handed out the single shared module-level `opcodes`
# object, so every missing key aliased (and could corrupt) the same mapping.
all_data = defaultdict(lambda: defaultdict(int))
# token -> number of files in which the token occurs at least once.
idf_data = defaultdict(int)
# Not used in the visible part of this script; kept for compatibility.
all_opcodes = []

print("Opcode processing...")
start = time.time()

# Shell stages applied to the objdump output: keep the disassembly lines, cut
# out the column after the address, keep its first six space-separated fields,
# normalise whitespace, join everything into one line and wrap it in quotes.
# NOTE(review): because the last two sed stages add a leading and a trailing
# double quote, the tokens touching those quotes never pass the filter below
# -- confirm that dropping them is acceptable.
pipeline = "|grep '[0-9a-f]:'|grep -v 'file'|cut -f2 -d:|cut -f1-6 -d' '|tr -s ' '|tr '\t' ' '|sed 's/ $//g'|paste -d '' -s |sed 's/^/\"/'|sed 's/$/\"/g'"

# Build the lookup set once; testing membership against the list would be a
# linear scan for every distinct token of every file.
valid_opcodes = set(opcodes_list)

for count, i in enumerate(file_list):
    if count % 100 == 0:
        print(count)
    # Quote the path: the command goes through a shell, so a filename with
    # spaces or metacharacters would otherwise break or inject into it.
    command = "objdump -d " + shlex.quote(os.path.join(data_dir, i)) + pipeline
    return_v = subprocess.check_output(command, shell=True).decode("utf-8")
    result = return_v.split()

    # Per-file counts, restricted to two-character tokens that are listed in
    # opcodes_list (first-occurrence order is preserved).
    opcodes = {
        v: n
        for v, n in Counter(result).items()
        if len(v) == 2 and v in valid_opcodes
    }
    # Document frequency: one increment per file that contains the token.
    for v in opcodes:
        idf_data[v] += 1
    all_data[i] = opcodes
end = time.time()
print("Processing time: {}s".format(end-start))

start = time.time()
print("TF-IDF Processing")
# Rescale each per-file count by the number of files containing that token,
# rounded to 7 decimal places. The per-file mappings are updated in place.
# NOTE(review): this divides the raw count by the document frequency; it is
# not the log(N/df) weighting usually meant by "TF-IDF" -- confirm intent.
for index, (i, weights) in enumerate(all_data.items()):
    print(index)
    for j, term_count in weights.items():
        weights[j] = round(term_count / idf_data[j], 7)
end = time.time()
print("Processing time: {}s".format(end-start))

# Output column layout: "filename" first, followed by one column per opcode.
opcodes_list.insert(0, "filename")

# Collect one row per file and build the frame in a single call.
# DataFrame.append (used previously) was removed in pandas 2.0 and copied the
# whole frame on every call, which made this loop quadratic.
rows = []
for index, i in enumerate(all_data):
    print(index)
    all_data[i]['filename'] = i
    rows.append(all_data[i])

# Tokens absent from a file are left as NaN in that file's row.
df_data = pd.DataFrame(rows, columns=opcodes_list)

# Load the answer sheet.
# NOTE(review): the merge below needs a "filename" column in this sheet; the
# original remark "need to add column name" suggests the header may have to be
# added to the spreadsheet by hand -- confirm.
label_csv = pd.read_excel(answer_file)

# Attach the answer-sheet columns to the feature rows, matching on "filename".
# The merge uses pandas' default join type (inner), so rows without a match
# on either side are dropped.
temp = df_data.merge(label_csv, on='filename')

# Persist the merged table without the positional index.
temp.to_csv("train_data_tfidf.csv", index=False)