Commit ab899d5: add default user

Masahiro Nishiba committed Mar 13, 2019
1 parent a7dd2c9 commit ab899d5
Showing 4 changed files with 101 additions and 24 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -107,4 +107,5 @@ venv.bak/
 .idea/
 
 # redshells
-resources/
+resources/
+./sandbox/
2 changes: 1 addition & 1 deletion redshells/model/gcmc_dataset.py
@@ -135,5 +135,5 @@ def _get_feature_size(values):
     feature_size = _get_feature_size(features.values())
     new_order, _ = zip(*list(sorted(order_map.items(), key=lambda x: x[1])))
     sorted_features = np.array(list(map(lambda x: features.get(x, np.zeros(feature_size)), new_order)))
-    sorted_features = np.vstack([sorted_features, np.zeros(feature_size)])
+    sorted_features = np.vstack([np.zeros(feature_size), sorted_features])
     return sorted_features.astype(np.float32)
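This one-line change moves the all-zeros fallback row from the end of the feature matrix to the front, so index 0 becomes a default (cold-start) row and every real index shifts up by one. A minimal sketch of the effect, with hypothetical 2-dimensional features (not the commit's code):

import numpy as np

feature_size = 2
features = {'a': np.array([1., 2.]), 'b': np.array([3., 4.])}
new_order = ['a', 'b', 'unseen']  # 'unseen' falls back to np.zeros(feature_size)
sorted_features = np.array([features.get(x, np.zeros(feature_size)) for x in new_order])

before = np.vstack([sorted_features, np.zeros(feature_size)])  # default row appended last
after = np.vstack([np.zeros(feature_size), sorted_features])   # default row prepended first

assert (after[1:] == before[:-1]).all()  # every real row moves from index i to i + 1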
49 changes: 27 additions & 22 deletions redshells/model/graph_convolutional_matrix_completion.py
@@ -11,6 +11,7 @@
 import pandas as pd
 import redshells
 from redshells.model.early_stopping import EarlyStopping
+from redshells.model.gcmc_dataset import GcmcDataset
 
 logger = getLogger(__name__)
 
@@ -352,13 +353,14 @@ def __init__(self,
         self.use_bias = use_bias
         self.ignore_item_embedding = ignore_item_embedding
         self.save_directory_path = save_directory_path
-        self.dataset = GCMCDataset(
+        self.dataset = GcmcDataset(
             self.user_ids,
             self.item_ids,
             self.ratings,
             self.test_size,
             user_information=self.user_features,
-            item_information=self.item_features)
+            item_information=self.item_features,
+            min_user_click_count=5)
         self.graph = None
 
     def fit(self, try_count=1, decay_speed=10.) -> List[str]:
@@ -374,7 +376,7 @@ def fit(self, try_count=1, decay_speed=10.) -> List[str]:
             learning_rate=self.learning_rate,
             threshold=1e-4)
 
-        test_user_indices, test_item_indices, test_labels, test_ratings = self.dataset.test_data()
+        test_data = self.dataset.test_data()
         report = []
         with self.session.as_default():
             self.session.run(tf.global_variables_initializer())
@@ -390,19 +392,20 @@ def fit(self, try_count=1, decay_speed=10.) -> List[str]:
             self.session.run(iterator.initializer)
             while True:
                 try:
-                    _user_indices, _item_indices, _labels, _ratings = self.session.run(next_batch)
+                    train_data = self.session.run(next_batch)
                     _rating_adjacency_matrix = [
-                        self._eliminate(matrix, _user_indices, _item_indices) for matrix in rating_adjacency_matrix
+                        self._eliminate(matrix, train_data['user'], train_data['item'])
+                        for matrix in rating_adjacency_matrix
                     ]
                     feed_dict = {
                         self.graph.input_learning_rate: early_stopping.learning_rate,
                         self.graph.input_dropout: self.dropout_rate,
-                        self.graph.input_user: _user_indices,
-                        self.graph.input_item: _item_indices,
-                        self.graph.input_label: _labels,
-                        self.graph.input_rating: _ratings,
-                        self.graph.input_user_information: _user_indices,
-                        self.graph.input_item_information: _item_indices,
+                        self.graph.input_user: train_data['user'],
+                        self.graph.input_item: train_data['item'],
+                        self.graph.input_label: train_data['label'],
+                        self.graph.input_rating: train_data['rating'],
+                        self.graph.input_user_information: train_data['user_information'],
+                        self.graph.input_item_information: train_data['item_information'],
                     }
                     feed_dict.update({
                         g: _convert_sparse_matrix_to_sparse_tensor(m)
@@ -419,12 +422,12 @@ def fit(self, try_count=1, decay_speed=10.) -> List[str]:
             logger.info(report[-1])
             feed_dict = {
                 self.graph.input_dropout: 0.0,
-                self.graph.input_user: test_user_indices,
-                self.graph.input_item: test_item_indices,
-                self.graph.input_label: test_labels,
-                self.graph.input_rating: test_ratings,
-                self.graph.input_user_information: test_user_indices,
-                self.graph.input_item_information: test_item_indices,
+                self.graph.input_user: test_data['user'],
+                self.graph.input_item: test_data['item'],
+                self.graph.input_label: test_data['label'],
+                self.graph.input_rating: test_data['rating'],
+                self.graph.input_user_information: test_data['user_information'],
+                self.graph.input_item_information: test_data['item_information'],
             }
             feed_dict.update({
                 g: _convert_sparse_matrix_to_sparse_tensor(m)
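Both the training batches and the held-out test split now arrive as dictionaries instead of positional tuples, so each tensor is fetched by name. A sketch of the assumed batch structure, inferred from the feed_dict keys above (not a confirmed GcmcDataset API):

import numpy as np

batch = {
    'user': np.array([3, 8]),               # user indices
    'item': np.array([5, 7]),               # item indices
    'label': np.eye(3)[[0, 2]],             # one-hot rating labels
    'rating': np.array([1.0, 3.0]),         # raw rating values
    'user_information': np.array([3, 8]),   # rows into the user feature matrix
    'item_information': np.array([5, 7]),   # rows into the item feature matrix
}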
@@ -447,12 +450,14 @@ def predict(self, user_ids: List, item_ids: List) -> np.ndarray:
             RuntimeError('Please call fit first.')
 
         rating_adjacency_matrix = self.dataset.train_rating_adjacency_matrix()
-        user_indices, item_indices = self.dataset.convert(user_ids, item_ids)
+        user_indices, item_indices = self.dataset.to_indices(user_ids, item_ids)
         valid_indices = np.logical_and(user_indices != -1, item_indices != -1)
         feed_dict = {
             self.graph.input_dropout: 0.0,
             self.graph.input_user: user_indices[valid_indices],
             self.graph.input_item: item_indices[valid_indices],
+            self.graph.input_user_information: user_indices[valid_indices],
+            self.graph.input_item_information: item_indices[valid_indices],
         }
         feed_dict.update({
             g: _convert_sparse_matrix_to_sparse_tensor(m)
@@ -468,7 +473,7 @@ def predict(self, user_ids: List, item_ids: List) -> np.ndarray:
         return predictions
 
     def predict_item_scores(self, item_ids: List) -> pd.DataFrame:
-        user_ids = list(self.dataset.user2index.keys())
+        user_ids = list(self.dataset.user_id_map.id2index.keys())
         _test_users, _test_items = zip(*list(itertools.product(user_ids, item_ids)))
         predicts = self.predict(user_ids=_test_users, item_ids=_test_items)
         results = pd.DataFrame(dict(user=_test_users, item=_test_items, score=predicts))
@@ -477,9 +482,9 @@ def predict_item_scores(self, item_ids: List) -> pd.DataFrame:
 
     def _make_graph(self) -> GraphConvolutionalMatrixCompletionGraph:
         return GraphConvolutionalMatrixCompletionGraph(
-            n_rating=len(self.dataset.rating2index),
-            n_user=len(self.dataset.user2index),
-            n_item=len(self.dataset.item2index),
+            n_rating=len(self.dataset.rating_id_map.id2index),
+            n_user=len(self.dataset.user_id_map.id2index) + 1,  # TODO
+            n_item=len(self.dataset.item_id_map.id2index) + 1,  # TODO
             rating=self.dataset.rating(),
             normalization_type=self.normalization_type,
             encoder_hidden_size=self.encoder_hidden_size,
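The + 1 on n_user and n_item sizes the embedding tables for the extra default row that the dataset now reserves at index 0 (the zero feature row prepended in gcmc_dataset.py); real ids occupy indices 1..n. A minimal TF 1.x sketch of the lookup, with illustrative names (not the commit's graph code):

import tensorflow as tf  # TF 1.x API, matching tf.global_variables_initializer() above

n_user, dim = 101, 16
# n_user + 1 rows: row 0 is the default user, known users map to 1..n_user.
user_embedding = tf.get_variable('user_embedding', shape=(n_user + 1, dim))
input_user = tf.placeholder(tf.int32, shape=(None,))
embedded = tf.nn.embedding_lookup(user_embedding, input_user)  # index 0 yields the default embedding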
71 changes: 71 additions & 0 deletions sandbox/test_dataset.py
@@ -0,0 +1,71 @@
+import itertools
+from builtins import sorted
+from collections import Counter
+from logging import getLogger
+from typing import List, Optional, Dict, Tuple, Any
+
+import numpy as np
+import scipy.sparse as sp
+import sklearn
+import tensorflow as tf
+import pandas as pd
+import redshells
+from redshells.model.early_stopping import EarlyStopping
+from redshells.model.gcmc_dataset import GcmcDataset
+from redshells.model.graph_convolutional_matrix_completion import GCMCDataset
+
+logger = getLogger(__name__)
+
+
+def _make_sparse_matrix(n, m, n_values):
+    x = np.zeros(shape=(n, m), dtype=np.float32)
+    x[np.random.choice(range(n), n_values), np.random.choice(range(m), n_values)] = 1.0
+    return sp.csr_matrix(x)
+
+
+def main():
+    np.random.seed(12)
+    n_users = 101
+    n_items = 233
+    n_data = 3007
+    n_features = 21
+    test_size = 0.2
+    adjacency_matrix = _make_sparse_matrix(n_users, n_items, n_data) + 2 * _make_sparse_matrix(n_users, n_items, n_data)
+    user_ids = adjacency_matrix.tocoo().row
+    item_ids = adjacency_matrix.tocoo().col
+    ratings = adjacency_matrix.tocoo().data
+    item_features = dict(zip(range(n_items), np.random.uniform(size=(n_items, n_features))))
+
+    np.random.seed(34)
+    dataset0 = GCMCDataset(
+        user_ids, item_ids, ratings, test_size, user_information=None, item_information=item_features)
+
+    np.random.seed(34)
+    dataset1 = GcmcDataset(
+        user_ids, item_ids, ratings, test_size, user_information=None, item_information=item_features)
+
+    import IPython
+    IPython.embed()
+    dataset0.user2index
+    dataset1.user_id_map.id2index
+
+    dataset0.item2index
+    dataset1.item_id_map.id2index
+
+    (dataset0.item_indices + 1 - dataset1.item_indices).max()
+    dataset1.user_indices
+
+    (dataset0.rating_indices - dataset1.rating_indices).max()
+
+    (dataset0.item_indices + 1 - dataset1.item_information_indices).max()
+
+    (dataset0.ratings - dataset1.ratings).max()
+
+    (dataset0.train_indices.astype(int) - dataset1.train_indices.astype(int)).max()
+
+    dataset0.item_information
+    dataset1.item_information
+
+
+if __name__ == '__main__':
+    main()
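The bare expressions after IPython.embed() are crib notes for the interactive session: if the new GcmcDataset agrees with the old GCMCDataset up to the +1 index shift for the default row, each difference evaluates to zero. The same checks written as assertions, assuming the attribute names used above:

assert np.array_equal(dataset0.item_indices + 1, dataset1.item_indices)
assert np.array_equal(dataset0.rating_indices, dataset1.rating_indices)
assert np.array_equal(dataset0.item_indices + 1, dataset1.item_information_indices)
assert np.array_equal(dataset0.ratings, dataset1.ratings)
assert np.array_equal(dataset0.train_indices.astype(int), dataset1.train_indices.astype(int))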
