diff --git a/FB15k-237.tar.gz b/FB15k-237.tar.gz
new file mode 100644
index 0000000..d87987f
Binary files /dev/null and b/FB15k-237.tar.gz differ
diff --git a/YAGO3-10/yago3_mte10-train.tsv.gz b/YAGO3-10.tar.gz
similarity index 67%
rename from YAGO3-10/yago3_mte10-train.tsv.gz
rename to YAGO3-10.tar.gz
index f9312d2..be63d16 100644
Binary files a/YAGO3-10/yago3_mte10-train.tsv.gz and b/YAGO3-10.tar.gz differ
diff --git a/YAGO3-10/yago3_mte10-test.tsv.gz b/YAGO3-10/yago3_mte10-test.tsv.gz
deleted file mode 100644
index d075d9b..0000000
Binary files a/YAGO3-10/yago3_mte10-test.tsv.gz and /dev/null differ
diff --git a/YAGO3-10/yago3_mte10-valid.tsv.gz b/YAGO3-10/yago3_mte10-valid.tsv.gz
deleted file mode 100644
index 4046f47..0000000
Binary files a/YAGO3-10/yago3_mte10-valid.tsv.gz and /dev/null differ
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..58e5feb
--- /dev/null
+++ b/main.py
@@ -0,0 +1,166 @@
+import json
+import torch
+import pickle
+import numpy as np
+import argparse
+import sys
+import os
+import math
+
+from os.path import join
+import torch.backends.cudnn as cudnn
+
+from training import ranking_and_hits
+from model import ConvE, DistMult, Complex
+
+from spodernet.preprocessing.pipeline import Pipeline, DatasetStreamer
+from spodernet.preprocessing.processors import JsonLoaderProcessors, Tokenizer, AddToVocab, SaveLengthsToState, StreamToHDF5, SaveMaxLengthsToState, CustomTokenizer
+from spodernet.preprocessing.processors import ConvertTokenToIdx, ApplyFunction, ToLower, DictKey2ListMapper, StreamToBatch
+from spodernet.utils.global_config import Config, Backends
+from spodernet.utils.logger import Logger, LogLevel
+from spodernet.preprocessing.batching import StreamBatcher
+from spodernet.preprocessing.processors import TargetIdx2MultiTarget
+from spodernet.hooks import LossHook, ETAHook
+from spodernet.utils.util import Timer
+from spodernet.utils.cuda_utils import CUDATimer
+np.set_printoptions(precision=3)
+
+timer = CUDATimer()
+cudnn.benchmark = True
+
+# parse console parameters and set global variables
+Config.backend = Backends.TORCH
+Config.parse_argv(sys.argv)
+
+Config.cuda = True
+
+Config.hidden_size = 1
+Config.embedding_dim = 200
+#Logger.GLOBAL_LOG_LEVEL = LogLevel.DEBUG
+
+
+model_name = 'ConvE_{0}_{1}'.format(Config.input_dropout, Config.dropout)
+do_process = True
+epochs = 1000
+Config.batch_size = 128
+load = False
+#dataset_name = 'YAGO3-10'
+#dataset_name = 'WN18RR'
+dataset_name = 'FB15k-237'
+model_path = 'saved_models/{0}_{1}.model'.format(dataset_name, model_name)
+
+
+''' Preprocess knowledge graph using spodernet.
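+
+Builds one shared vocabulary over the full label graph, then streams the
+train/dev/test splits to HDF5 so StreamBatcher can serve (e1, rel, e2_multi)
+batches to the models.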
''' +def preprocess(dataset_name, delete_data=False): + full_path = 'data/{0}/e1rel_to_e2_full.json'.format(dataset_name) + train_path = 'data/{0}/e1rel_to_e2_train.json'.format(dataset_name) + dev_ranking_path = 'data/{0}/e1rel_to_e2_ranking_dev.json'.format(dataset_name) + test_ranking_path = 'data/{0}/e1rel_to_e2_ranking_test.json'.format(dataset_name) + + keys2keys = {} + keys2keys['e1'] = 'e1' # entities + keys2keys['rel'] = 'rel' # relations + keys2keys['e2'] = 'e1' # entities + keys2keys['e2_multi1'] = 'e1' # entity + keys2keys['e2_multi2'] = 'e1' # entity + input_keys = ['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2'] + d = DatasetStreamer(input_keys) + d.add_stream_processor(JsonLoaderProcessors()) + d.add_stream_processor(DictKey2ListMapper(input_keys)) + + # process full vocabulary and save it to disk + d.set_path(full_path) + p = Pipeline(dataset_name, delete_data, keys=input_keys, skip_transformation=True) + p.add_sent_processor(ToLower()) + p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),keys=['e2_multi1', 'e2_multi2']) + p.add_token_processor(AddToVocab()) + p.execute(d) + p.save_vocabs() + + + # process train, dev and test sets and save them to hdf5 + p.skip_transformation = False + for path, name in zip([train_path, dev_ranking_path, test_ranking_path], ['train', 'dev_ranking', 'test_ranking']): + d.set_path(path) + p.clear_processors() + p.add_sent_processor(ToLower()) + p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),keys=['e2_multi1', 'e2_multi2']) + p.add_post_processor(ConvertTokenToIdx(keys2keys=keys2keys), keys=['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2']) + p.add_post_processor(StreamToHDF5(name, samples_per_file=1000, keys=input_keys)) + p.execute(d) + + +def main(): + if do_process: preprocess(dataset_name, delete_data=True) + input_keys = ['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2'] + p = Pipeline(dataset_name, keys=input_keys) + p.load_vocabs() + vocab = p.state['vocab'] + + num_entities = vocab['e1'].num_token + + train_batcher = StreamBatcher(dataset_name, 'train', Config.batch_size, randomize=True, keys=input_keys) + dev_rank_batcher = StreamBatcher(dataset_name, 'dev_ranking', Config.batch_size, randomize=False, loader_threads=4, keys=input_keys, is_volatile=True) + test_rank_batcher = StreamBatcher(dataset_name, 'test_ranking', Config.batch_size, randomize=False, loader_threads=4, keys=input_keys, is_volatile=True) + + + #model = Complex(vocab['e1'].num_token, vocab['rel'].num_token) + #model = DistMult(vocab['e1'].num_token, vocab['rel'].num_token) + model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token) + + train_batcher.at_batch_prepared_observers.insert(1,TargetIdx2MultiTarget(num_entities, 'e2_multi1', 'e2_multi1_binary')) + + + eta = ETAHook('train', print_every_x_batches=100) + train_batcher.subscribe_to_events(eta) + train_batcher.subscribe_to_start_of_epoch_event(eta) + train_batcher.subscribe_to_events(LossHook('train', print_every_x_batches=100)) + + if Config.cuda: + model.cuda() + if load: + model_params = torch.load(model_path) + print(model) + print([(key, value.size()) for key, value in model_params.items()]) + model.load_state_dict(model_params) + model.eval() + ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation') + ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation') + else: + model.init() + + + opt = torch.optim.Adam(model.parameters(), lr=Config.learning_rate, weight_decay=Config.L2) + for epoch in range(epochs): + model.train() + for i, str2var in enumerate(train_batcher): + 
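+            # one update step: fetch the multi-label target over all entities,
+            # apply label smoothing, score every entity for (e1, rel), and backprop the BCE loss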
opt.zero_grad() + e1 = str2var['e1'] + rel = str2var['rel'] + e2_multi = str2var['e2_multi1_binary'].float() + # label smoothing + e2_multi = ((1.0-Config.label_smoothing_epsilon)*e2_multi) + (1.0/e2_multi.size(1)) + + pred = model.forward(e1, rel) + loss = model.loss(pred, e2_multi) + loss.backward() + opt.step() + + train_batcher.state.loss = loss + + + print('saving to {0}'.format(model_path)) + torch.save(model.state_dict(), model_path) + + model.eval() + ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation') + if epoch % 3 == 0: + if epoch > 0: + ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation') + + +if __name__ == '__main__': + main() diff --git a/model.py b/model.py new file mode 100644 index 0000000..4d11744 --- /dev/null +++ b/model.py @@ -0,0 +1,127 @@ +import torch +from torch.nn import functional as F, Parameter +from torch.autograd import Variable + + +from spodernet.utils.global_config import Config +from spodernet.utils.cuda_utils import CUDATimer +from torch.nn.init import xavier_normal, xavier_uniform +from spodernet.utils.cuda_utils import CUDATimer +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + +timer = CUDATimer() + + +class Complex(torch.nn.Module): + def __init__(self, num_entities, num_relations): + super(Complex, self).__init__() + self.num_entities = num_entities + self.emb_e_real = torch.nn.Embedding(num_entities, Config.embedding_dim, padding_idx=0) + self.emb_e_img = torch.nn.Embedding(num_entities, Config.embedding_dim, padding_idx=0) + self.emb_rel_real = torch.nn.Embedding(num_relations, Config.embedding_dim, padding_idx=0) + self.emb_rel_img = torch.nn.Embedding(num_relations, Config.embedding_dim, padding_idx=0) + self.inp_drop = torch.nn.Dropout(Config.input_dropout) + self.loss = torch.nn.BCELoss() + + def init(self): + xavier_normal(self.emb_e_real.weight.data) + xavier_normal(self.emb_e_img.weight.data) + xavier_normal(self.emb_rel_real.weight.data) + xavier_normal(self.emb_rel_img.weight.data) + + def forward(self, e1, rel): + + e1_embedded_real = self.inp_drop(self.emb_e_real(e1)).view(Config.batch_size, -1) + rel_embedded_real = self.inp_drop(self.emb_rel_real(rel)).view(Config.batch_size, -1) + e1_embedded_img = self.inp_drop(self.emb_e_img(e1)).view(Config.batch_size, -1) + rel_embedded_img = self.inp_drop(self.emb_rel_img(rel)).view(Config.batch_size, -1) + + e1_embedded_real = self.inp_drop(e1_embedded_real) + rel_embedded_real = self.inp_drop(rel_embedded_real) + e1_embedded_img = self.inp_drop(e1_embedded_img) + rel_embedded_img = self.inp_drop(rel_embedded_img) + + # complex space bilinear product (equivalent to HolE) + realrealreal = torch.mm(e1_embedded_real*rel_embedded_real, self.emb_e_real.weight.transpose(1,0)) + realimgimg = torch.mm(e1_embedded_real*rel_embedded_img, self.emb_e_img.weight.transpose(1,0)) + imgrealimg = torch.mm(e1_embedded_img*rel_embedded_real, self.emb_e_img.weight.transpose(1,0)) + imgimgreal = torch.mm(e1_embedded_img*rel_embedded_img, self.emb_e_real.weight.transpose(1,0)) + pred = realrealreal + realimgimg + imgrealimg - imgimgreal + pred = F.sigmoid(pred) + + return pred + + +class DistMult(torch.nn.Module): + def __init__(self, num_entities, num_relations): + super(DistMult, self).__init__() + self.emb_e = torch.nn.Embedding(num_entities, Config.embedding_dim, padding_idx=0) + self.emb_rel = torch.nn.Embedding(num_relations, Config.embedding_dim, padding_idx=0) + self.inp_drop = torch.nn.Dropout(Config.input_dropout) + self.loss = torch.nn.BCELoss() + 
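+    # see forward(): DistMult scores all candidate entities at once, sigmoid((e1_emb * rel_emb) @ E^T)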
+    def init(self):
+        xavier_normal(self.emb_e.weight.data)
+        xavier_normal(self.emb_rel.weight.data)
+
+    def forward(self, e1, rel):
+        e1_embedded = self.emb_e(e1)
+        rel_embedded = self.emb_rel(rel)
+        e1_embedded = e1_embedded.view(-1, Config.embedding_dim)
+        rel_embedded = rel_embedded.view(-1, Config.embedding_dim)
+
+        e1_embedded = self.inp_drop(e1_embedded)
+        rel_embedded = self.inp_drop(rel_embedded)
+
+        pred = torch.mm(e1_embedded*rel_embedded, self.emb_e.weight.transpose(1,0))
+        pred = F.sigmoid(pred)
+
+        return pred
+
+
+class ConvE(torch.nn.Module):
+    def __init__(self, num_entities, num_relations):
+        super(ConvE, self).__init__()
+        self.emb_e = torch.nn.Embedding(num_entities, Config.embedding_dim, padding_idx=0)
+        self.emb_rel = torch.nn.Embedding(num_relations, Config.embedding_dim, padding_idx=0)
+        self.inp_drop = torch.nn.Dropout(Config.input_dropout)
+        self.hidden_drop = torch.nn.Dropout(Config.dropout)
+        self.feature_map_drop = torch.nn.Dropout2d(Config.feature_map_dropout)
+        self.loss = torch.nn.BCELoss()
+
+        self.conv1 = torch.nn.Conv2d(1, 8, (3, 3), 1, 0, bias=Config.use_bias)
+        self.bn0 = torch.nn.BatchNorm2d(1)
+        self.bn1 = torch.nn.BatchNorm2d(8)
+        self.bn2 = torch.nn.BatchNorm1d(Config.embedding_dim)
+        self.register_parameter('b', Parameter(torch.zeros(num_entities)))
+        # 8 feature maps of size 18x18 after the 3x3 convolution over the 20x20 input: 8*18*18 = 2592
+        self.fc = torch.nn.Linear(2592, Config.embedding_dim)
+        print(num_entities, num_relations)
+
+    def init(self):
+        xavier_normal(self.emb_e.weight.data)
+        xavier_normal(self.emb_rel.weight.data)
+
+    def forward(self, e1, rel):
+        # reshape the embeddings to 10x20 "images" and stack them into a 1x20x20 input
+        e1_embedded = self.emb_e(e1).view(Config.batch_size, 1, 10, 20)
+        rel_embedded = self.emb_rel(rel).view(Config.batch_size, 1, 10, 20)
+
+        stacked_inputs = torch.cat([e1_embedded, rel_embedded], 2)
+
+        stacked_inputs = self.bn0(stacked_inputs)
+        x = self.inp_drop(stacked_inputs)
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = F.relu(x)
+        x = self.feature_map_drop(x)
+        x = x.view(Config.batch_size, -1)
+        x = self.fc(x)
+        x = self.hidden_drop(x)
+        x = self.bn2(x)
+        x = F.relu(x)
+        # score against all entity embeddings plus a per-entity bias
+        x = torch.mm(x, self.emb_e.weight.transpose(1,0))
+        x += self.b.expand_as(x)
+        pred = F.sigmoid(x)
+
+        return pred
+
diff --git a/preprocess.sh b/preprocess.sh
new file mode 100644
index 0000000..c5759b4
--- /dev/null
+++ b/preprocess.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+mkdir data
+mkdir data/WN18RR
+mkdir data/YAGO3-10
+mkdir data/FB15k-237
+mkdir saved_models
+tar -xvf WN18RR.tar.gz -C data/WN18RR
+tar -xvf YAGO3-10.tar.gz -C data/YAGO3-10
+tar -xvf FB15k-237.tar.gz -C data/FB15k-237
+python wrangle_KG.py WN18RR
+python wrangle_KG.py YAGO3-10
+python wrangle_KG.py FB15k-237
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e0ddebc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+-e git+https://github.com/TimDettmers/spodernet.git#egg=spodernet
diff --git a/training.py b/training.py
new file mode 100644
index 0000000..5d6a3e1
--- /dev/null
+++ b/training.py
@@ -0,0 +1,107 @@
+import torch
+import numpy as np
+import datetime
+
+from spodernet.utils.global_config import Config
+from spodernet.utils.cuda_utils import CUDATimer
+from spodernet.utils.logger import Logger
+from torch.autograd import Variable
+from sklearn import metrics
+
+#timer = CUDATimer()
+log = Logger('training_{0}.py.txt'.format(datetime.datetime.now()))
+
+def ranking_and_hits(model, dev_rank_batcher, vocab, name):
+    log.info('')
+    log.info('-'*50)
+    log.info(name)
+    log.info('-'*50)
+    log.info('')
+    hits_left = []
+    hits_right = []
+    hits = []
+    ranks = []
+    ranks_left = []
+    ranks_right = []
+    mrr_left = []
+    mrr_right = []
+    rel2ranks = {}
+    for i in range(10):
+        hits_left.append([])
+        hits_right.append([])
+        hits.append([])
+
+    for i, str2var in enumerate(dev_rank_batcher):
+        e1 = str2var['e1']
+        e2 = str2var['e2']
+        rel = str2var['rel']
+        e2_multi1 = str2var['e2_multi1'].float()
+        e2_multi2 = str2var['e2_multi2'].float()
+        pred1 = model.forward(e1, rel)
+        pred2 = model.forward(e2, rel)
+        pred1, pred2 = pred1.data, pred2.data
+        e1, e2 = e1.data, e2.data
+        e2_multi1, e2_multi2 = e2_multi1.data, e2_multi2.data
+        for i in range(Config.batch_size):
+            # these filters contain ALL labels
+            filter1 = e2_multi1[i].long()
+            filter2 = e2_multi2[i].long()
+
+            # save the prediction that is relevant
+            target_value1 = pred1[i,e2[i, 0]]
+            target_value2 = pred2[i,e1[i, 0]]
+            # zero all known cases (these are not interesting)
+            # this corresponds to the filtered setting
+            pred1[i][filter1] = 0.0
+            pred2[i][filter2] = 0.0
+            # write back the saved values
+            pred1[i][e2[i]] = target_value1
+            pred2[i][e1[i]] = target_value2
+
+
+        # sort and rank
+        max_values, argsort1 = torch.sort(pred1, 1, descending=True)
+        max_values, argsort2 = torch.sort(pred2, 1, descending=True)
+
+        argsort1 = argsort1.cpu().numpy()
+        argsort2 = argsort2.cpu().numpy()
+        for i in range(Config.batch_size):
+            # find the rank of the target entities
+            rank1 = np.where(argsort1[i]==e2[i, 0])[0][0]
+            rank2 = np.where(argsort2[i]==e1[i, 0])[0][0]
+            # rank+1, since the lowest rank is rank 1 not rank 0
+            ranks.append(rank1+1)
+            ranks_left.append(rank1+1)
+            ranks.append(rank2+1)
+            ranks_right.append(rank2+1)
+
+            # this could be done more elegantly, but here you go
+            for hits_level in range(10):
+                if rank1 <= hits_level:
+                    hits[hits_level].append(1.0)
+                    hits_left[hits_level].append(1.0)
+                else:
+                    hits[hits_level].append(0.0)
+                    hits_left[hits_level].append(0.0)
+
+                if rank2 <= hits_level:
+                    hits[hits_level].append(1.0)
+                    hits_right[hits_level].append(1.0)
+                else:
+                    hits[hits_level].append(0.0)
+                    hits_right[hits_level].append(0.0)
+
+        dev_rank_batcher.state.loss = [0]
+
+    for i in range(10):
+        log.info('Hits left @{0}: {1}'.format(i+1, np.mean(hits_left[i])))
+        log.info('Hits right @{0}: {1}'.format(i+1, np.mean(hits_right[i])))
+        log.info('Hits @{0}: {1}'.format(i+1, np.mean(hits[i])))
+    log.info('Mean rank left: {0}', np.mean(ranks_left))
+    log.info('Mean rank right: {0}', np.mean(ranks_right))
+    log.info('Mean rank: {0}', np.mean(ranks))
+    log.info('Mean reciprocal rank left: {0}', np.mean(1./np.array(ranks_left)))
+    log.info('Mean reciprocal rank right: {0}', np.mean(1./np.array(ranks_right)))
+    log.info('Mean reciprocal rank: {0}', np.mean(1./np.array(ranks)))
+
diff --git a/wrangle_KG.py b/wrangle_KG.py
new file mode 100644
index 0000000..42cbb11
--- /dev/null
+++ b/wrangle_KG.py
@@ -0,0 +1,144 @@
+from __future__ import print_function
+from os.path import join
+import json
+
+import argparse
+import datetime
+import requests
+import urllib
+try:
+    import cPickle as pickle  # Python 2
+except ImportError:
+    import pickle  # Python 3
+import os
+import numpy as np
+import operator
+import sys
+
+rdm = np.random.RandomState(234234)
+
+if len(sys.argv) > 1:
+    dataset_name = sys.argv[1]
+else:
+    dataset_name = 'FB15k-237'
+    #dataset_name = 'FB15k'
+    #dataset_name = 'yago'
+    #dataset_name = 'WN18RR'
+
+print('Processing dataset {0}'.format(dataset_name))
+
+rdm = np.random.RandomState(2342423)
+base_path = 'data/{0}/'.format(dataset_name)
+files = ['train.txt', 'valid.txt', 'test.txt']
+
+data = []
+for p in files:
+    with open(join(base_path, p)) as f:
+        data = f.readlines() + data
+
+
+def convert_mid(e):
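+    # note: mid2data (a Freebase MID -> metadata mapping) is not defined in this
+    # script and convert_mid is never called; it only applies when such a mapping is loaded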
+    if e in mid2data:
+        if 'name' in mid2data[e]:
+            return mid2data[e]['name']
+    return e
+
+
+egraph = {}
+d_egraph = {}
+d_egraph_sets = {}
+test_cases = {}
+e_rel_direction = {}
+for p in files:
+    test_cases[p] = []
+    d_egraph_sets[p] = {}
+
+
+# build (entity, relation) -> {answer entities} maps over all splits (d_egraph)
+# and per split (d_egraph_sets), and remember each triple per split (test_cases)
+for p in files:
+    with open(join(base_path, p)) as f:
+        for i, line in enumerate(f):
+            e1, rel, e2 = line.split('\t')
+            e1 = e1.strip()
+            e2 = e2.strip()
+            rel = rel.strip()
+
+            if (e1, rel) not in d_egraph:
+                d_egraph[(e1, rel)] = set()
+
+            if (e2, rel) not in d_egraph:
+                d_egraph[(e2, rel)] = set()
+
+            if (e1, rel) not in d_egraph_sets[p]:
+                d_egraph_sets[p][(e1, rel)] = set()
+            if (e2, rel) not in d_egraph_sets[p]:
+                d_egraph_sets[p][(e2, rel)] = set()
+
+            if e1+rel not in e_rel_direction:
+                e_rel_direction[e1+rel] = 'left'
+            else:
+                e_rel_direction[e1+rel] = 'bidirectional'
+
+            if e2+rel not in e_rel_direction:
+                e_rel_direction[e2+rel] = 'right'
+            else:
+                e_rel_direction[e2+rel] = 'bidirectional'
+
+            d_egraph[(e1, rel)].add(e2)
+            d_egraph[(e2, rel)].add(e1)
+            test_cases[p].append([e1, rel, e2])
+            d_egraph_sets[p][(e1, rel)].add(e2)
+            d_egraph_sets[p][(e2, rel)].add(e1)
+
+
+#print('largest entities relations:')
+#for i in range(10):
+#    print(sorted_x[i])
+
+def write_e1rel_graph(cases, graph, path):
+    with open(path, 'w') as f:
+        n = len(graph)
+        for i, key in enumerate(graph):
+            e1, rel = key
+            entities = list(graph[key])
+            direction = e_rel_direction[e1+rel]
+
+            entities1 = " ".join(entities)
+
+            data_point = {}
+            data_point['e1'] = e1
+            data_point['e2'] = str(rdm.choice(entities))
+            data_point['rel'] = rel
+            data_point['direction1'] = direction
+            data_point['direction2'] = 'none'
+            data_point['e2_multi1'] = entities1
+            data_point['e2_multi2'] = "None"
+
+            f.write(json.dumps(data_point) + '\n')
+
+def write_e1rel_ranking_graph(cases, graph, path):
+    with open(path, 'w') as f:
+        n = len(cases)
+        for i, (e1, rel, e2) in enumerate(cases):
+            entities1 = list(graph[(e1, rel)])
+            entities2 = list(graph[(e2, rel)])
+            direction1 = e_rel_direction[e1+rel]
+            direction2 = e_rel_direction[e2+rel]
+
+            entities1 = " ".join(entities1)
+            entities2 = " ".join(entities2)
+
+            data_point = {}
+            data_point['e1'] = e1
+            data_point['e2'] = e2
+            data_point['rel'] = rel
+            data_point['direction1'] = direction1
+            data_point['direction2'] = direction2
+            data_point['e2_multi1'] = entities1
+            data_point['e2_multi2'] = entities2
+
+            f.write(json.dumps(data_point) + '\n')
+
+
+all_cases = test_cases['train.txt'] + test_cases['valid.txt'] + test_cases['test.txt']
+write_e1rel_graph(test_cases['train.txt'], d_egraph_sets['train.txt'], 'data/{0}/e1rel_to_e2_train.json'.format(dataset_name))
+write_e1rel_ranking_graph(test_cases['valid.txt'], d_egraph, 'data/{0}/e1rel_to_e2_ranking_dev.json'.format(dataset_name))
+write_e1rel_ranking_graph(test_cases['test.txt'], d_egraph, 'data/{0}/e1rel_to_e2_ranking_test.json'.format(dataset_name))
+write_e1rel_graph(all_cases, d_egraph, 'data/{0}/e1rel_to_e2_full.json'.format(dataset_name))
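+
+# The four JSON files written above are exactly the inputs preprocess() in main.py
+# expects under data/<dataset>/: e1rel_to_e2_train.json and e1rel_to_e2_full.json
+# (label graphs) plus e1rel_to_e2_ranking_dev.json and e1rel_to_e2_ranking_test.json
+# (per-triple ranking sets). Each line is one JSON object, e.g. (illustrative values):
+#   {"e1": "...", "rel": "...", "e2": "...", "direction1": "left", "direction2": "none",
+#    "e2_multi1": "ent_a ent_b", "e2_multi2": "None"}
+# Typical flow: bash preprocess.sh (untar the datasets and run this script), then python main.py.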