Skip to content

Commit

Permalink
Warm and cold users support (#77)
Browse files Browse the repository at this point in the history
- supported warm users and items in `Dataset`
- removed `return_external_ids` parameter in `recommend` and
`recommend_to_items` methods
- supported cold and warm targets in base model
- supported new dataset in cross validation

The first part of
#87
  • Loading branch information
feldlime authored Apr 2, 2024
1 parent eed8e69 commit 51cbd8d
Show file tree
Hide file tree
Showing 43 changed files with 1,423 additions and 565 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
key: venv-${{ runner.os }}-3.8-${{ hashFiles('**/poetry.lock') }}

- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
Expand Down Expand Up @@ -65,7 +65,7 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
key: venv-${{ runner.os }}-${{ matrix.python-version }}-old-deps-${{ matrix.old-deps }}-${{ hashFiles('**/poetry.lock') }}

- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# MacOS
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
3 changes: 2 additions & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ disable=arguments-differ,
unused-argument,
use-implicit-booleaness-not-comparison,
use-symbolic-message-instead,
abstract-method

# Enable the message, report, category or checker with the given id(s).
# You can either give multiple identifier separated by comma (,) or
Expand Down Expand Up @@ -446,7 +447,7 @@ max-args=15
max-attributes=12

# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=2
max-bool-expr=3

# Maximum number of branch for function / method body.
max-branches=9
Expand Down
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [0.6.0] - Unreleased

### Added
- Warm users/items support in `Dataset` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
- Warm and cold users/items support in `ModelBase` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
- Warm and cold users/items support in `cross_validate` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))

### Removed
- `return_external_ids` parameter in `recommend` and `recommend_to_items` model methods ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))


## [0.5.0] - 22.03.2024

### Added
Expand Down
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ tqdm = "^4.27.0"
implicit = "^0.7.1"
attrs = ">=19.1.0,<24.0.0"
typeguard = "^2.0.1"
typing-extensions = "4.7.1" # TODO: remove after dropping support for python 3.7


lightfm = {version = ">=1.16,<=1.17", optional = true}
Expand Down
68 changes: 52 additions & 16 deletions rectools/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

from rectools import Columns

from .features import AbsentIdError, DenseFeatures, Features, SparseFeatures, UnknownIdError
from .features import AbsentIdError, DenseFeatures, Features, SparseFeatures
from .identifiers import IdMap
from .interactions import Interactions

Expand All @@ -36,8 +36,8 @@ class Dataset:
user-item interactions, user and item features
in special `rectools` structures for convenient future usage.
This is data class, so you can create it explicitly, but
it's recommended to use `construct` method.
WARNING: It's highly not recommended to create `Dataset` object directly.
Use `construct` class method instead.
Parameters
----------
Expand All @@ -59,6 +59,38 @@ class Dataset:
user_features: tp.Optional[Features] = attr.ib(default=None)
item_features: tp.Optional[Features] = attr.ib(default=None)

@property
def n_hot_users(self) -> int:
    """
    Number of hot users in the dataset.

    Hot users are those present in interactions; they occupy internal ids
    ``0 .. n_hot_users - 1``. Internal ids from ``n_hot_users`` up to
    ``user_id_map.size - 1`` belong to warm users (absent from interactions
    but having features).
    """
    last_hot_internal_id = self.interactions.df[Columns.User].max()
    return last_hot_internal_id + 1

@property
def n_hot_items(self) -> int:
    """
    Number of hot items in the dataset.

    Hot items are those present in interactions; they occupy internal ids
    ``0 .. n_hot_items - 1``. Internal ids from ``n_hot_items`` up to
    ``item_id_map.size - 1`` belong to warm items (absent from interactions
    but having features).
    """
    last_hot_internal_id = self.interactions.df[Columns.Item].max()
    return last_hot_internal_id + 1

def get_hot_user_features(self) -> tp.Optional[Features]:
    """Return user features restricted to hot users, or ``None`` if the dataset has no user features."""
    features = self.user_features
    if features is None:
        return None
    hot_internal_ids = range(self.n_hot_users)
    return features.take(hot_internal_ids)

def get_hot_item_features(self) -> tp.Optional[Features]:
    """Return item features restricted to hot items, or ``None`` if the dataset has no item features."""
    features = self.item_features
    if features is None:
        return None
    hot_internal_ids = range(self.n_hot_items)
    return features.take(hot_internal_ids)

@classmethod
def construct(
cls,
Expand Down Expand Up @@ -112,15 +144,16 @@ def construct(
user_id_map = IdMap.from_values(interactions_df[Columns.User].values)
item_id_map = IdMap.from_values(interactions_df[Columns.Item].values)
interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map)
user_features = cls._make_features(

user_features, user_id_map = cls._make_features(
user_features_df,
cat_user_features,
make_dense_user_features,
user_id_map,
Columns.User,
"user",
)
item_features = cls._make_features(
item_features, item_id_map = cls._make_features(
item_features_df,
cat_item_features,
make_dense_item_features,
Expand All @@ -138,32 +171,30 @@ def _make_features(
id_map: IdMap,
possible_id_col: str,
feature_type: str,
) -> tp.Optional[Features]:
) -> tp.Tuple[tp.Optional[Features], IdMap]:
if df is None:
return None
return None, id_map

id_col = possible_id_col if possible_id_col in df else "id"
id_map = id_map.add_ids(df[id_col].values, raise_if_already_present=False)

if make_dense:
try:
return DenseFeatures.from_dataframe(df, id_map, id_col=id_col)
except UnknownIdError:
raise ValueError(f"Some ids from {feature_type} features table not present in interactions")
return DenseFeatures.from_dataframe(df, id_map, id_col=id_col), id_map
except AbsentIdError:
raise ValueError(
f"An error has occurred while constructing {feature_type} features: "
"When using dense features all ids from interactions must present in features table"
"When using dense features all ids from interactions must be present in features table"
)
except Exception as e: # pragma: no cover
raise RuntimeError(f"An error has occurred while constructing {feature_type} features: {e!r}")

try:
return SparseFeatures.from_flatten(df, id_map, cat_features, id_col=id_col)
except UnknownIdError:
raise ValueError(f"Some ids from {feature_type} features table not present in interactions")
return SparseFeatures.from_flatten(df, id_map, cat_features, id_col=id_col), id_map
except Exception as e: # pragma: no cover
raise RuntimeError(f"An error has occurred while constructing {feature_type} features: {e!r}")

def get_user_item_matrix(self, include_weights: bool = True) -> sparse.csr_matrix:
def get_user_item_matrix(self, include_weights: bool = True, include_warm: bool = False) -> sparse.csr_matrix:
"""
Construct user-item CSR matrix based on `interactions` attribute.
Expand All @@ -177,14 +208,19 @@ def get_user_item_matrix(self, include_weights: bool = True) -> sparse.csr_matri
include_weights : bool, default ``True``
Whether include interaction weights in matrix or not.
If False, all values in returned matrix will be equal to ``1``.
include_warm : bool, default ``False``
Whether to include warm users and items into the matrix or not.
Rows and columns for warm users and items will be added to the end of matrix,
they will contain only zeros.
Returns
-------
csr_matrix
Resized user-item CSR matrix
"""
matrix = self.interactions.get_user_item_matrix(include_weights)
matrix.resize(self.user_id_map.internal_ids.size, self.item_id_map.internal_ids.size)
if include_warm:
matrix.resize(self.user_id_map.size, self.item_id_map.size)
return matrix

def get_raw_interactions(self, include_weight: bool = True, include_datetime: bool = True) -> pd.DataFrame:
Expand Down
8 changes: 8 additions & 0 deletions rectools/dataset/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,10 @@ def take(self, ids: InternalIds) -> "DenseFeatures":
names=self.names,
)

def __len__(self) -> int:
    """Number of objects described by these features (rows of the values matrix)."""
    n_objects, _ = self.values.shape
    return n_objects


SparseFeatureName = tp.Tuple[str, tp.Any]

Expand Down Expand Up @@ -442,5 +446,9 @@ def take(self, ids: InternalIds) -> "SparseFeatures":
names=self.names,
)

def __len__(self) -> int:
    """Number of objects described by these features (rows of the values matrix)."""
    n_objects, _ = self.values.shape
    return n_objects


Features = tp.Union[DenseFeatures, SparseFeatures]
64 changes: 57 additions & 7 deletions rectools/dataset/identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import attr
import numpy as np
import pandas as pd
import typing_extensions as tpe

from rectools import ExternalId, ExternalIds, InternalId, InternalIds
from rectools.utils import fast_isin, get_from_series_by_index
Expand Down Expand Up @@ -97,6 +98,11 @@ def size(self) -> int:
"""Return number of ids in map."""
return self.external_ids.size

@property
def external_dtype(self) -> tp.Type:
    """Dtype of the external ids array."""
    ids = self.external_ids
    return ids.dtype

@property
def to_internal(self) -> pd.Series:
"""Map internal->external."""
Expand All @@ -120,7 +126,21 @@ def get_external_sorted_by_internal(self) -> np.ndarray:
"""Return array of external ids sorted by internal ids."""
return self.external_ids

def convert_to_internal(self, external: ExternalIds, strict: bool = True) -> np.ndarray:
@tp.overload
def convert_to_internal( # noqa: D102
self, external: ExternalIds, strict: bool = ..., return_missing: tpe.Literal[False] = False
) -> np.ndarray: # pragma: no cover
...

@tp.overload
def convert_to_internal( # noqa: D102
self, external: ExternalIds, strict: bool = ..., *, return_missing: tpe.Literal[True]
) -> tp.Tuple[np.ndarray, np.ndarray]: # pragma: no cover
...

def convert_to_internal(
self, external: ExternalIds, strict: bool = True, return_missing: bool = False
) -> tp.Union[np.ndarray, tp.Tuple[np.ndarray, np.ndarray]]:
"""
Convert any sequence of external ids to array of internal ids (map external -> internal).
Expand All @@ -132,21 +152,43 @@ def convert_to_internal(self, external: ExternalIds, strict: bool = True) -> np.
Defines behaviour when some of given external ids do not exist in mapping.
- If ``True``, `KeyError` will be raised;
- If ``False``, nonexistent ids will be skipped.
return_missing : bool, default ``False``
If True, return a tuple of 2 arrays: internal ids and missing ids (that are not in map).
Works only if `strict` is False.
Returns
-------
np.ndarray
Array of internal ids.
np.ndarray, np.ndarray
Tuple of 2 arrays: internal ids and missing ids.
Only if `strict` is False and `return_missing` is True.
Raises
------
KeyError
If some of given external ids do not exist in mapping and `strict` flag is ``True``.
ValueError
If `strict` and `return_missing` are both ``True``.
"""
internal = get_from_series_by_index(self.to_internal, external, strict)
return internal

def convert_to_external(self, internal: InternalIds, strict: bool = True) -> np.ndarray:
result = get_from_series_by_index(self.to_internal, external, strict, return_missing)
return result

@tp.overload
def convert_to_external( # noqa: D102
self, internal: InternalIds, strict: bool = ..., return_missing: tpe.Literal[False] = False
) -> np.ndarray: # pragma: no cover
...

@tp.overload
def convert_to_external( # noqa: D102
self, internal: InternalIds, strict: bool = ..., *, return_missing: tpe.Literal[True]
) -> tp.Tuple[np.ndarray, np.ndarray]: # pragma: no cover
...

def convert_to_external(
self, internal: InternalIds, strict: bool = True, return_missing: bool = False
) -> tp.Union[np.ndarray, tp.Tuple[np.ndarray, np.ndarray]]:
"""
Convert any sequence of internal ids to array of external ids (map internal -> external).
Expand All @@ -158,19 +200,27 @@ def convert_to_external(self, internal: InternalIds, strict: bool = True) -> np.
Defines behaviour when some of given internal ids do not exist in mapping.
- If ``True``, `KeyError` will be raised;
- If ``False``, nonexistent ids will be skipped.
return_missing : bool, default ``False``
If True, return a tuple of 2 arrays: external ids and missing ids (that are not in map).
Works only if `strict` is False.
Returns
-------
np.ndarray
Array of external ids.
np.ndarray, np.ndarray
Tuple of 2 arrays: external ids and missing ids.
Only if `strict` is False and `return_missing` is True.
Raises
------
KeyError
If some of given internal ids do not exist in mapping and `strict` flag is True.
ValueError
If `strict` and `return_missing` are both ``True``.
"""
external = get_from_series_by_index(self.to_external, internal, strict)
return external
result = get_from_series_by_index(self.to_external, internal, strict, return_missing)
return result

def add_ids(self, values: ExternalIds, raise_if_already_present: bool = False) -> "IdMap":
"""
Expand Down
16 changes: 9 additions & 7 deletions rectools/dataset/torch_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ def __init__(
@classmethod
def from_dataset(cls: tp.Type[DD], dataset: Dataset) -> DD:
ui_matrix = dataset.get_user_item_matrix()
if dataset.item_features is not None:
item_features = dataset.item_features.get_sparse()
else:

# We take hot here since this dataset is used for fit only
item_features = dataset.get_hot_item_features()
user_features = dataset.get_hot_user_features()
if item_features is None:
raise AttributeError("Item features attribute of dataset could not be None")
if dataset.user_features is not None:
user_features = dataset.user_features.get_sparse()
else:
if user_features is None:
raise AttributeError("User features attribute of dataset could not be None")
return cls(items=item_features, users=user_features, interactions=ui_matrix)
return cls(items=item_features.get_sparse(), users=user_features.get_sparse(), interactions=ui_matrix)

def __len__(self) -> int:
return self.interactions.shape[0]
Expand Down Expand Up @@ -114,6 +114,7 @@ def __init__(self, items: sparse.csr_matrix):

@classmethod
def from_dataset(cls: tp.Type[ID], dataset: Dataset) -> ID:
# We take all features here since this dataset is used for recommend only, not for fit
if dataset.item_features is not None:
return cls(dataset.item_features.get_sparse())
raise AttributeError("Item features attribute of dataset could not be None")
Expand Down Expand Up @@ -155,6 +156,7 @@ def from_dataset(
dataset: Dataset,
keep_users: tp.Optional[tp.Sequence[int]] = None,
) -> UD:
# We take all features here since this dataset is used for recommend only, not for fit
if dataset.user_features is not None:
return cls(
dataset.user_features.get_sparse(),
Expand Down
Loading

0 comments on commit 51cbd8d

Please sign in to comment.