Skip to content

Commit

Permalink
++Added Cross-validation with random seeds
Browse files Browse the repository at this point in the history
  • Loading branch information
Habush committed May 18, 2018
1 parent 98313b9 commit 249a0d3
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 19 deletions.
38 changes: 19 additions & 19 deletions cross_val/moses_cross_val.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from pathlib import Path
from cross_val.random_seed import RandomSeed

default_list = "['-j', '6', '--balance', '1', '-m', '1000', '-W1', '1', '--output-cscore', '1', '--result-count', '100', '--reduct-knob-building-effort=1', '--hc-widen-search=1', '--enable-fs=1', '--fs-algo=smd', '--fs-target-size=4', '--hc-crossover-min-neighbors=5000', '--fs-focus=all', '--fs-seed=init', '--complexity-ratio=1', '--hc-fraction-of-nn=.3', '--hc-crossover-pop-size=1000']"

Expand Down Expand Up @@ -46,24 +47,12 @@ def run_moses(self):
return ret

def run_eval(self):

combo_program = self._format_combo(self.output)
temp_out = "eval_" + self.id
if combo_program:
cmd = "eval-table -i {0} -C {1} -o {2} -u{3}".format(self.test_file, self.output, temp_out, "case")
print(cmd)
ret = subprocess.Popen(args=cmd, shell=True).wait()

return ret

return -1

def _format_combo(self, input_file):
temp_combo = "temp_combo_" + self.id
cmd = " cut -d\" \" -f1 --complement " + input_file + " > " + "temp_combo_" + self.id + " && cat " + temp_combo + " > " + input_file
subprocess.Popen(args=cmd, shell=True).wait()
os.remove(temp_combo)
return input_file
cmd = "eval-table -i {0} -C {1} -o {2} -u{3}".format(self.test_file, self.output, temp_out, "case")
print(cmd)
ret = subprocess.Popen(args=cmd, shell=True).wait()
return ret

def build_matrix(self):
files = list(Path(".").glob("eval_" + self.id + "[0-9]*"))
Expand Down Expand Up @@ -119,15 +108,26 @@ def shuffle_split(self):
for train_index, test_index in self.cv.split(x, y):
x_train, x_test = x[train_index], x[test_index]

self.output = self.id + "_" + str(i)
i += 1
self.output = "{0}_fold_{1}".format(self.id, str(i))

self.test = y[test_index]

pd.DataFrame(x_train, columns=self.dataset.columns.values).to_csv(self.train_file, index=False)

pd.DataFrame(x_test, columns=self.dataset.columns.values).to_csv(self.test_file, index=False)

self.run()
randSeed = RandomSeed(self.train_file, self.id, i)

randSeed.run()
self.run_eval()

print("Successfully finished process!")

rec, pre, acc = self.score()

print("Recall: {0:.1f}\tPrecison:{1:.1f}\tAccuracy:{2:.1f}".format(rec, pre, acc))

i += 1

def run(self):

Expand Down
85 changes: 85 additions & 0 deletions cross_val/random_seed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
__author__ = 'Xabush Semrie'

import itertools
import os
import random
import shlex
import subprocess

# Default MOSES command-line options, kept as the *string form* of a Python
# list rather than a real list. RandomSeed.run_moses() deletes the "[", "]"
# and "," characters from this string and appends what is left to the shell
# command, so the single-quoted tokens are unquoted by the shell itself.
default_list = "['-j', '6', '--balance', '1', '-m', '100', '-W1', '1', '--output-cscore', '1', '--result-count', '100', '--reduct-knob-building-effort=1', '--hc-widen-search=1', '--enable-fs=1', '--fs-algo=smd', '--fs-target-size=4', '--hc-crossover-min-neighbors=5000', '--fs-focus=all', '--fs-seed=init', '--complexity-ratio=1', '--hc-fraction-of-nn=.3', '--hc-crossover-pop-size=1000']"


class RandomSeed:
    """Run ``moses`` on one training fold with several random seeds.

    For every sampled seed the ``moses`` command is executed on the training
    file, the first space-separated column of its output is dropped, and the
    leading lines of the result are collected. ``run()`` finally writes all
    collected lines to ``"<id>_fold_<fold>"`` and deletes the per-seed files.
    """

    # Values used when the corresponding constructor argument is None.
    DEFAULT_NUM_RANDS = 5
    DEFAULT_NUM_MODELS = 30

    def __init__(self, train_file, id, fold, opts=None, num_rands=None, num_models=None):
        """Sample the seeds and store the run configuration.

        :param train_file: path of the training data handed to ``moses -i``.
        :param id: prefix used for every file name this object creates.
        :param fold: fold number; stored as a string and used in file names.
        :param opts: option string in the same "stringified list" format as
            the module-level ``default_list``; ``None`` selects that default.
        :param num_rands: how many distinct seeds to draw from 1..999
            (``None`` means 5). ``random.sample`` raises ``ValueError`` when
            this exceeds the population size.
        :param num_models: how many leading lines of each per-seed output are
            kept by ``top_models`` (``None`` means 30). Must be non-negative.
        """
        self.num_rands = self.DEFAULT_NUM_RANDS if num_rands is None else num_rands
        self.num_models = self.DEFAULT_NUM_MODELS if num_models is None else num_models

        self.rand_population = range(1, 1000)
        self.random_seeds = random.sample(self.rand_population, self.num_rands)

        self.id = id
        self.train_file = train_file
        self.opts = default_list if opts is None else opts
        self.fold = str(fold)

        # Lines collected from the per-seed outputs, in seed order.
        self.models = []
        # Per-seed output files that run() deletes once it has finished.
        self.files = []

    def run(self):
        """Run every seed, merge the kept lines into one file and clean up.

        The merged file is named ``"<id>_fold_<fold>"``. Any ``OSError`` from
        reading a per-seed output (for example when ``moses`` produced no
        file) propagates to the caller.
        """
        # Start from a clean slate: without this a second call would write
        # every model twice and then fail removing the same file twice.
        self.models = []
        self.files = []

        for seed in self.random_seeds:
            self.output = "{0}_fold_{1}_seed_{2}".format(self.id, self.fold, seed)
            self.files.append(self.output)

            self.run_moses(seed)
            self.output = self.format_combo(self.output)
            self.top_models()

            print("Run Fold: " + str(self.fold) + " Seed: " + str(seed))

        file_name = "{0}_fold_{1}".format(self.id, self.fold)
        with open(file_name, 'w') as merged:
            merged.writelines(self.models)

        for seed_file in self.files:
            os.remove(seed_file)

    def top_models(self):
        """Append the first ``num_models`` lines of ``self.output`` to ``self.models``.

        NOTE(review): this keeps the *leading* lines; it presumably relies on
        the file already being ordered best-first - confirm against moses.
        """
        with open(self.output, 'r') as combo_file:
            self.models.extend(itertools.islice(combo_file, self.num_models))

    def run_moses(self, seed):
        """Execute ``moses`` for one seed and return the process exit status.

        The command is echoed before it runs and the exit status afterwards.

        :param seed: value passed as ``--random-seed``.
        """
        # Turn the stringified list into plain shell words; the remaining
        # quotes are consumed by the shell.
        opts = self.opts.translate(str.maketrans("", "", "[],"))
        # Quote the paths so names containing spaces or shell metacharacters
        # stay a single, inert argument (ordinary names are left unchanged).
        opts = "-i {0} -o {1} --random-seed={2} ".format(
            shlex.quote(str(self.train_file)), shlex.quote(str(self.output)), seed) + opts
        cmd = "moses " + opts
        print(cmd)
        ret = subprocess.Popen(args=cmd, shell=True).wait()
        print(ret)
        return ret

    def format_combo(self, input_file):
        """Drop the first space-delimited field of every line, in place.

        Mirrors ``cut -d" " -f1 --complement``: a line without a space is
        kept unchanged and every output line ends with a newline. Done in
        Python so it needs neither GNU ``cut`` nor a shared temporary file.

        :param input_file: path of the file to rewrite.
        :returns: ``input_file``, for call chaining.
        :raises OSError: if the file cannot be read or written.
        """
        # Bytes mode keeps the content exactly as the external tool would.
        with open(input_file, 'rb') as combo:
            raw_lines = combo.readlines()

        trimmed = []
        for line in raw_lines:
            _, sep, rest = line.partition(b" ")
            kept = rest if sep else line
            if not kept.endswith(b"\n"):
                kept += b"\n"
            trimmed.append(kept)

        with open(input_file, 'wb') as combo:
            combo.writelines(trimmed)
        return input_file

0 comments on commit 249a0d3

Please sign in to comment.