From fcc2816d0a2f71992051f9097dd8a07ddedc03aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David-Elias=20K=C3=BCnstle?= Date: Wed, 30 Jun 2021 10:39:59 +0200 Subject: [PATCH] More datasets (#20) Datasets: Car, ImageNet v0.1 and v0.2, Things, Nature, Vogue, Material * Add utility to transform odd-one-out, n-select, and n-rank queries to triplets. * Add preprocessing methods for queries from object attributes * Fix dangling url for musicsim dataset * Use remote-data directive to indicate tests that depend on the internet, disable by default. * Add h5py to CI in order to run dataset tests --- .github/workflows/python-package.yml | 3 +- README.md | 25 ++- cblearn/datasets/__init__.py | 8 +- cblearn/datasets/_car_similarity.py | 132 +++++++++++++ cblearn/datasets/_food_similarity.py | 11 +- cblearn/datasets/_imagenet_similarity.py | 157 +++++++++++++++ cblearn/datasets/_material_similarity.py | 143 ++++++++++++++ cblearn/datasets/_musician_similarity.py | 17 +- cblearn/datasets/_nature_vogue_similarity.py | 170 ++++++++++++++++ cblearn/datasets/_things_similarity.py | 121 ++++++++++++ cblearn/datasets/descr/car_similarity.rst | 33 ++++ .../datasets/descr/imagenet_similarity.rst | 35 ++++ .../datasets/descr/material_similarity.rst | 30 +++ .../datasets/descr/musician_similarity.rst | 2 +- .../descr/nature_vogue_similarity.rst | 32 +++ cblearn/datasets/descr/things_similarity.rst | 33 ++++ .../datasets/tests/test_food_similarity.py | 2 +- .../tests/test_musician_similarity.py | 10 +- cblearn/embedding/wrapper/_mlds.py | 3 + cblearn/embedding/wrapper/_soe.py | 3 + cblearn/preprocessing/__init__.py | 5 + cblearn/preprocessing/_label.py | 187 ++++++++++++++++++ cblearn/preprocessing/_query.py | 100 ++++++++++ cblearn/preprocessing/tests/__init__.py | 0 cblearn/preprocessing/tests/test_query.py | 73 +++++++ docs/references/index.rst | 24 ++- docs/user_guide/index.rst | 5 + pyproject.toml | 3 +- setup.cfg | 5 +- 29 files changed, 1341 insertions(+), 31 deletions(-) create mode 
100644 cblearn/datasets/_car_similarity.py create mode 100644 cblearn/datasets/_imagenet_similarity.py create mode 100644 cblearn/datasets/_material_similarity.py create mode 100644 cblearn/datasets/_nature_vogue_similarity.py create mode 100644 cblearn/datasets/_things_similarity.py create mode 100644 cblearn/datasets/descr/car_similarity.rst create mode 100644 cblearn/datasets/descr/imagenet_similarity.rst create mode 100644 cblearn/datasets/descr/material_similarity.rst create mode 100644 cblearn/datasets/descr/nature_vogue_similarity.rst create mode 100644 cblearn/datasets/descr/things_similarity.rst create mode 100644 cblearn/preprocessing/__init__.py create mode 100644 cblearn/preprocessing/_label.py create mode 100644 cblearn/preprocessing/_query.py create mode 100644 cblearn/preprocessing/tests/__init__.py create mode 100644 cblearn/preprocessing/tests/test_query.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7f732ef..3f34dc6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -32,6 +32,7 @@ jobs: run: | python3 -m pip install --upgrade pip pip install -e .[r_wrapper,tests,docs,torch] + pip install h5py - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -47,7 +48,7 @@ jobs: flake8 . --count --exit-zero --statistics --show-source - name: Test with pytest and measure coverage run: | - pytest cblearn --cov=cblearn --cov-report=xml + pytest cblearn --cov=cblearn --cov-report=xml --remote-data - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 with: diff --git a/README.md b/README.md index d51a7e1..4f4e326 100644 --- a/README.md +++ b/README.md @@ -46,9 +46,29 @@ Find more details in the [installation instructions](https://cblearn.readthedocs In the [User Guide](https://cblearn.readthedocs.io/en/latest/user_guide/index.html) you find a detailed introduction. 
-## Algorithms +## Features + +### Datasets + +*cblearn* provides utility methods to simplify the loading and conversion +of your comparison datasets. In addition, there are +functions that download and load multiple real-world comparison datasets. + +| Dataset | Query | #Object | #Response | #Triplet | +| --- | --- | ---:| ---:| ---:| +| Vogue Cover | Odd-out Triplet | 60 | 1,107 | 2,214 | +| Nature Scene | Odd-out Triplet | 120 | 3,355 | 6,710 | +| Car | Most-Central Triplet | 60 | 7,097 | 14,194 | +| Material | Standard Triplet | 100 | 104,692 |104,692 | +| Food | Standard Triplet | 100 | 190,376 |190,376 | +| Musician | Standard Triplet | 413 | 224,792 |224,792 | +| Things Image Testset | Odd-out Triplet | 1,854 | 146,012 | 292,024 | +| ImageNet Images v0.1 | Rank 2 from 8 | 1,000 | 25,273 | 328,549 | +| ImageNet Images v0.2 | Rank 2 from 8 | 50,000 | 384,277 | 5M | + + +### Embedding Algorithms -### Embedding | Algorithm | Default | Pytorch (GPU) | Reference Wrapper | | --------------------------- | :---: | :-----------: | :---------------: | | Crowd Kernel Learning (CKL) | planned | X | | @@ -57,6 +77,7 @@ In the [User Guide](https://cblearn.readthedocs.io/en/latest/user_guide/index.ht | Maximum-Likelihood Difference Scaling (MLDS) | planned | | [MLDS (R)](https://cran.r-project.org/web/packages/MLDS/index.html)| | Soft Ordinal Embedding (SOE) | X | X | [loe (R)](https://cran.r-project.org/web/packages/loe/index.html) | | Stochastic Triplet Embedding (STE/t-STE) | planned | planned | planned | + ## Contribute We are happy about your contributions. 
diff --git a/cblearn/datasets/__init__.py b/cblearn/datasets/__init__.py index 70dca35..dfda1d2 100644 --- a/cblearn/datasets/__init__.py +++ b/cblearn/datasets/__init__.py @@ -1,5 +1,11 @@ from ._musician_similarity import fetch_musician_similarity from ._food_similarity import fetch_food_similarity +from ._material_similarity import fetch_material_similarity +from ._nature_vogue_similarity import fetch_nature_scene_similarity +from ._nature_vogue_similarity import fetch_vogue_cover_similarity +from ._things_similarity import fetch_things_similarity +from ._imagenet_similarity import fetch_imagenet_similarity +from ._car_similarity import fetch_car_similarity from ._triplet_simulation import make_all_triplets from ._triplet_simulation import make_random_triplets @@ -7,4 +13,4 @@ from ._triplet_indices import make_all_triplet_indices from ._triplet_indices import make_random_triplet_indices from ._triplet_answers import triplet_answers -from ._triplet_answers import noisy_triplet_answers +from ._triplet_answers import noisy_triplet_answers \ No newline at end of file diff --git a/cblearn/datasets/_car_similarity.py b/cblearn/datasets/_car_similarity.py new file mode 100644 index 0000000..8133988 --- /dev/null +++ b/cblearn/datasets/_car_similarity.py @@ -0,0 +1,132 @@ +from pathlib import Path +import logging +import joblib +import os +from typing import Optional, Union +import zipfile + +import numpy as np +from sklearn.datasets import _base +from sklearn.utils import check_random_state, Bunch + + +ARCHIVE = _base.RemoteFileMetadata( + filename='60_cars_data.zip', + url='http://www.tml.cs.uni-tuebingen.de/team/luxburg/code_and_data/60_cars_data.zip', + checksum=('5fa2ad932d48adf5cfe36bd16a08b25fd88d1519d974908f6ccbba769f629640')) + +logger = logging.getLogger(__name__) + + +def fetch_car_similarity(data_home: Optional[os.PathLike] = None, download_if_missing: bool = True, + shuffle: bool = True, random_state: Optional[np.random.RandomState] = None, + 
return_triplets: bool = False) -> Union[Bunch, np.ndarray]: + """ Load the 60-car dataset (most-central triplets). + + =================== ===================== + Triplets 7097 + Objects (Cars) 60 + Query 3 cars, most-central + =================== ===================== + + See :ref:`central_car_dataset` for a detailed description. + + >>> dataset = fetch_car_similarity(shuffle=False) # doctest: +REMOTE_DATA + >>> dataset.class_name.tolist() # doctest: +REMOTE_DATA + ['OFF-ROAD / SPORT UTILITY VEHICLES', 'ORDINARY CARS', 'OUTLIERS', 'SPORTS CARS'] + >>> dataset.triplet.shape # doctest: +REMOTE_DATA + (7097, 3) + + + Args: + data_home : optional, default: None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + download_if_missing : optional, default=True + shuffle: default = True + Shuffle the order of triplet constraints. + random_state: optional, default = None + Initialization for shuffle random generator + return_triplets : boolean, default=False. + If True, returns numpy array instead of a Bunch object. + + Returns: + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + triplet : ndarray, shape (n_triplets, 3) + Each row corresponding a triplet constraint. + The columns represent the three indices shown per most-central question. + response : ndarray, shape (n_triplets, ) + The car per question (0, 1, or 2) that was selected as "most-central". + rt_ms : ndarray, shape (n_triplets, ) + Reaction time of the response in milliseconds. + class_id : np.ndarray (60, ) + The class assigned to each object. + class_name : list (4) + Names of the classes. + DESCR : string + Description of the dataset. + triplets : numpy array (n_triplets, 3) + Only present when `return_triplets=True`. 
+ + Raises: + IOError: If the data is not locally available, but download_if_missing=False + """ + + data_home = Path(_base.get_data_home(data_home=data_home)) + if not data_home.exists(): + data_home.mkdir() + + filepath = Path(_base._pkl_filepath(data_home, 'car_centrality.pkz')) + if not filepath.exists(): + if not download_if_missing: + raise IOError("Data not found and `download_if_missing` is False") + + logger.info('Downloading 60-car dataset from {} to {}'.format(ARCHIVE.url, data_home)) + + archive_path = _base._fetch_remote(ARCHIVE, dirname=data_home) + with zipfile.ZipFile(archive_path) as zf: + with zf.open('60_cars_data/survey_data.csv', 'r') as f: + survey_data = np.loadtxt(f, dtype=str, delimiter=',', skiprows=1) + + joblib.dump(survey_data, filepath, compress=6) + os.remove(archive_path) + else: + survey_data = joblib.load(filepath) + + class_map = { + 'ORDINARY CARS': [2, 6, 7, 8, 9, 10, 11, 12, 16, 17, 25, 32, 35, 36, 37, 38, + 39, 41, 44, 45, 46, 55, 58, 60], + 'SPORTS CARS': [15, 19, 20, 28, 40, 42, 47, 48, 49, 50, 51, 52, 54, 56, 59], + 'OFF-ROAD / SPORT UTILITY VEHICLES': [1, 3, 4, 5, 13, 14, 18, 22, 24, 26, 27, + 29, 31, 33, 34, 43, 57], + 'OUTLIERS': [21, 23, 30, 53], + } + class_names = np.asarray(sorted(class_map.keys())) + classes = np.empty(60, dtype=int) + for cls_ix, cls_name in enumerate(class_names): + classes[np.array(class_map[cls_name]) - 1] = cls_ix + + if shuffle: + random_state = check_random_state(random_state) + shuffle_ix = random_state.permutation(len(survey_data)) + survey_data = survey_data[shuffle_ix] + + raw_triplets = survey_data[:, [2, 3, 4]].astype(int) + triplets = raw_triplets - 1 + response = (survey_data[:, [1]].astype(int) == raw_triplets).nonzero()[1] + rt_ms = survey_data[:, [5]].astype(float) + if return_triplets: + return triplets + + module_path = Path(__file__).parent + with module_path.joinpath('descr', 'car_similarity.rst').open() as rst_file: + fdescr = rst_file.read() + + return Bunch(triplet=triplets, 
+ response=response, + rt_ms=rt_ms, + class_id=classes, + class_name=class_names, + DESCR=fdescr) diff --git a/cblearn/datasets/_food_similarity.py b/cblearn/datasets/_food_similarity.py index 2defcf6..da8dc44 100644 --- a/cblearn/datasets/_food_similarity.py +++ b/cblearn/datasets/_food_similarity.py @@ -65,9 +65,8 @@ def fetch_food_similarity(data_home: Optional[os.PathLike] = None, download_if_m if not data_home.exists(): data_home.mkdir() - triplet_filepath = Path(_base._pkl_filepath(data_home, 'food_similarity_triplets.pkz')) - image_filepath = Path(_base._pkl_filepath(data_home, 'food_similarity_images.pkz')) - if not triplet_filepath.exists() or not image_filepath.exists(): + filepath = Path(_base._pkl_filepath(data_home, 'food_similarity.pkz')) + if not filepath.exists(): if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") @@ -82,12 +81,10 @@ def fetch_food_similarity(data_home: Optional[os.PathLike] = None, download_if_m if name.startswith('food100-dataset/images/') and name.endswith('.jpg')]) - joblib.dump(triplets, triplet_filepath, compress=6) - joblib.dump(image_names, image_filepath, compress=6) + joblib.dump((triplets, image_names), filepath, compress=6) os.remove(archive_path) else: - triplets = joblib.load(triplet_filepath) - image_names = joblib.load(image_filepath) + triplets, image_names = joblib.load(filepath) image_names = np.sort(image_names) triplets = np.searchsorted(image_names, triplets) diff --git a/cblearn/datasets/_imagenet_similarity.py b/cblearn/datasets/_imagenet_similarity.py new file mode 100644 index 0000000..e194d82 --- /dev/null +++ b/cblearn/datasets/_imagenet_similarity.py @@ -0,0 +1,157 @@ +from pathlib import Path +import logging +import joblib +import os +from os.path import join +from typing import Optional, Union +from urllib.request import urlretrieve +import zipfile + +import numpy as np +from sklearn.datasets import _base +from sklearn.utils import check_random_state, Bunch + 
+ARCHIVE = _base.RemoteFileMetadata( + filename='osfstorage-archive.zip', + url='https://files.osf.io/v1/resources/7f96y/providers/osfstorage/?zip=', + checksum=('cannot check - zip involves randomness')) + +logger = logging.getLogger(__name__) +__doctest_requires__ = {'fetch_imagenet_similarity': ['h5py']} + + +def fetch_imagenet_similarity(data_home: Optional[os.PathLike] = None, download_if_missing: bool = True, + shuffle: bool = True, random_state: Optional[np.random.RandomState] = None, + version: str = '0.1', return_data: bool = False) -> Union[Bunch, np.ndarray]: + """ Load the imagenet similarity dataset (rank 2 from 8). + + =================== ===================== + Trials v0.1/v0.2 25,273 / 384,277 + Objects (Images) 1,000 / 50,000 + Classes 1,000 + Query rank 2 from 8 + =================== ===================== + + See :ref:`imagenet_similarity_dataset` for a detailed description. + + .. Note : + Loading dataset requires the package `h5py`_, which was not installed as an dependency of cblearn. + + .. _`h5py`: https://docs.h5py.org/en/stable/build.html + + >>> dataset = fetch_imagenet_similarity(shuffle=True, version='0.1') # doctest: +REMOTE_DATA + >>> dataset.class_label[[0, -1]].tolist() # doctest: +REMOTE_DATA + ['n01440764', 'n15075141'] + >>> dataset.n_select, dataset.is_ranked # doctest: +REMOTE_DATA + (2, True) + >>> dataset.data.shape # doctest: +REMOTE_DATA + (25273, 9) + + Args: + data_home : optional, default: None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + download_if_missing : optional, default=True + shuffle: default = True + Shuffle the order of triplet constraints. + random_state: optional, default = None + Initialization for shuffle random generator + version: Version of the dataset. + '0.1' contains one object per class, + '0.2' 50 objects per class. + return_triplets : boolean, default=False. 
+ If True, returns numpy array instead of a Bunch object. + + Returns: + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray, shape (n_query, 9) + Each row corresponding a rank-2-of-8 query, entries are object indices. + The first column is the reference, the second column is the most similar, and the + third column is the second most similar object. + rt_ms : ndarray, shape (n_query, ) + Reaction time in milliseconds. + n_select : int + Number of selected objects per trial. + is_ranked : bool + Whether the selection is ranked in similarity to the reference. + session_id : (n_query,) + Ids of the survey session for query recording. + stimulus_id : (n_query,) + Ids of the image stimulus (object). + class_id : (n_query,) + Imagenet class assigned to each image. + class_label : (50000,) + WordNet labels of the classes. + DESCR : string + Description of the dataset. + data : numpy arrays (n_query, 9) + Only present when `return_data=True`. 
+ + Raises: + IOError: If the data is not locally available, but download_if_missing=False + """ + data_home = Path(_base.get_data_home(data_home=data_home)) + if not data_home.exists(): + data_home.mkdir() + + filepath = Path(_base._pkl_filepath(data_home, 'imagenet_similarity.pkz')) + if not filepath.exists(): + if not download_if_missing: + raise IOError("Data not found and `download_if_missing` is False") + + logger.info('Downloading imagenet similarity data from {} to {}'.format(ARCHIVE.url, data_home)) + + archive_path = (ARCHIVE.filename if data_home is None + else join(data_home, ARCHIVE.filename)) + urlretrieve(ARCHIVE.url, archive_path) + + with zipfile.ZipFile(archive_path) as zf: + import h5py + + with zf.open('val/obs/psiz0.4.1/obs-118.hdf5', 'r') as f: + data_v1 = {k: np.asarray(v[()]) for k, v in h5py.File(f, mode='r').items()} + + with zf.open('val/obs/psiz0.4.1/obs-195.hdf5', 'r') as f: + data_v2 = {k: np.asarray(v[()]) for k, v in h5py.File(f, mode='r').items()} + + with zf.open('val/catalogs/psiz0.4.1/catalog.hdf5', 'r') as f: + catalog = {k: np.asarray(v[()]) for k, v in h5py.File(f, mode='r').items()} + + joblib.dump((data_v1, data_v2, catalog), filepath, compress=6) + os.remove(archive_path) + else: + (data_v1, data_v2, catalog) = joblib.load(filepath) + + if str(version) == '0.1': + data = data_v1 + elif str(version) == '0.2': + data = data_v2 + else: + raise ValueError(f"Expects version '0.1' or '0.2', got '{version}'.") + + data.pop('trial_type') + catalog['class_map_label'] = catalog['class_map_label'].astype(str) + + if shuffle: + random_state = check_random_state(random_state) + ix = random_state.permutation(len(data['stimulus_set'])) + data = {k: v[ix] for k, v in data.items()} + + if return_data: + return data['stimulus_set'] + + module_path = Path(__file__).parent + with module_path.joinpath('descr', 'imagenet_similarity.rst').open() as rst_file: + fdescr = rst_file.read() + + return Bunch(data=data['stimulus_set'], + 
rt_ms=data['rt_ms'], + n_select=int(np.unique(data['n_select'])), + is_ranked=bool(np.unique(data['is_ranked'])), + session_id=data['session_id'], + stimulus_id=catalog['stimulus_id'], + class_id=catalog['class_id'], + class_label=catalog['class_map_label'][1:], + DESCR=fdescr) diff --git a/cblearn/datasets/_material_similarity.py b/cblearn/datasets/_material_similarity.py new file mode 100644 index 0000000..d491355 --- /dev/null +++ b/cblearn/datasets/_material_similarity.py @@ -0,0 +1,143 @@ +from pathlib import Path +import logging +import joblib +import json +import os +from typing import Optional, Union +import zipfile + +import numpy as np +from sklearn.datasets import _base +from sklearn.utils import check_random_state, Bunch +from cblearn.utils import check_triplet_answers + + +ARCHIVE = _base.RemoteFileMetadata( + filename='material-appearance-similarity-master.zip', + url='https://github.com/mlagunas/material-appearance-similarity/archive/refs/heads/master.zip', + checksum=('f0be4d573829fd5e5a7e7b332989545cbf6584eaf25e2555371703a9264f5937')) + +logger = logging.getLogger(__name__) + + +def fetch_material_similarity(data_home: Optional[os.PathLike] = None, download_if_missing: bool = True, + shuffle: bool = True, random_state: Optional[np.random.RandomState] = None, + return_triplets: bool = False) -> Union[Bunch, np.ndarray]: + """ Load the material similarity dataset (triplets). + + =================== ===================== + Triplets Train/Test 22801 / 3000 + Responses 92892 / 11800 + Objects (Materials) 100 + =================== ===================== + + See :ref:`material_similarity_dataset` for a detailed description. 
+ + >>> dataset = fetch_material_similarity(shuffle=True) # doctest: +REMOTE_DATA + >>> dataset.material_name[[0, -1]].tolist() # doctest: +REMOTE_DATA + ['alum-bronze', 'yellow-plastic'] + >>> dataset.triplet.shape, dataset.response.shape # doctest: +REMOTE_DATA + ((92892, 3), (92892,)) + + Args: + data_home : optional, default: None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + download_if_missing : optional, default=True + shuffle: default = True + Shuffle the order of triplet constraints. + random_state: optional, default = None + Initialization for shuffle random generator + return_triplets : boolean, default=False. + If True, returns numpy array instead of a Bunch object. + Returns: + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + triplet : ndarray, shape (n_triplets, 3) + Each row corresponding a triplet constraint. + The columns represent the reference and two other material indices. + response : ndarray, shape (n_triplets, ) + The count of subject responses that chose the first other (positive) or second other (negative) + material to be more similar to the reference material. + test_triplet : ndarray, shape (n_test_triplets, 3) + handoff test set. + test_response : ndarray, shape (n_test_triplets, ) + handoff test set. + material_name : ndarray, shape (100, ) + Names of the materials. + DESCR : string + Description of the dataset. + triplets, response : numpy arrays (n_triplets, 3) and (n_triplets, ) + Only present when `return_triplets=True`. 
+ Raises: + IOError: If the data is not locally available, but download_if_missing=False + """ + + data_home = Path(_base.get_data_home(data_home=data_home)) + if not data_home.exists(): + data_home.mkdir() + + filepath = Path(_base._pkl_filepath(data_home, 'material_similarity.pkz')) + if not filepath.exists(): + if not download_if_missing: + raise IOError("Data not found and `download_if_missing` is False") + + logger.info('Downloading material similarity data from {} to {}'.format(ARCHIVE.url, data_home)) + + archive_path = _base._fetch_remote(ARCHIVE, dirname=data_home) + with zipfile.ZipFile(archive_path) as zf: + with zf.open('material-appearance-similarity-master/data/answers_processed_test.json', 'r') as f: + test_data = json.load(f) + with zf.open('material-appearance-similarity-master/data/answers_processed_train.json', 'r') as f: + train_data = json.load(f) + + image_path = 'material-appearance-similarity-master/data/havran1_ennis_298x298_LDR/' + material_names = np.asarray([name[len(image_path):-len('.jpg')] for name in zf.namelist() + if name.startswith(image_path) and name.endswith('.jpg')]) + material_names.sort() + joblib.dump((train_data, test_data, material_names), filepath, compress=6) + os.remove(archive_path) + else: + (train_data, test_data, material_names) = joblib.load(filepath) + + train_triplets = np.array(train_data['answers']) + train_agreement = np.array(train_data['agreement']) + train_triplets_1, train_response_1 = check_triplet_answers(train_triplets[train_agreement[:, 0] > 0], + train_agreement[train_agreement[:, 0] > 0][:, 0], + result_format='list-count') + train_triplets_2, train_response_2 = check_triplet_answers(train_triplets[train_agreement[:, 1] > 0], + train_agreement[train_agreement[:, 1] > 0][:, 1], + result_format='list-count') + train_triplets, train_response = np.r_[train_triplets_1, train_triplets_2], np.r_[train_response_1, train_response_2] + + test_triplets = np.array(test_data['answers']) + test_agreement = 
np.array(test_data['agreement']) + test_triplets_1, test_response_1 = check_triplet_answers(test_triplets[test_agreement[:, 0] > 0], + test_agreement[test_agreement[:, 0] > 0][:, 0], + result_format='list-count') + test_triplets_2, test_response_2 = check_triplet_answers(test_triplets[test_agreement[:, 1] > 0], + test_agreement[test_agreement[:, 1] > 0][:, 1], + result_format='list-count') + test_triplets, test_response = np.r_[test_triplets_1, test_triplets_2], np.r_[test_response_1, test_response_2] + + if shuffle: + random_state = check_random_state(random_state) + train_ix = random_state.permutation(len(train_triplets)) + train_triplets, train_response = train_triplets[train_ix], train_response[train_ix] + test_ix = random_state.permutation(len(test_triplets)) + test_triplets, test_response = test_triplets[test_ix], test_response[test_ix] + + if return_triplets: + return train_triplets, train_response + + module_path = Path(__file__).parent + with module_path.joinpath('descr', 'material_similarity.rst').open() as rst_file: + fdescr = rst_file.read() + + return Bunch(triplet=train_triplets, + response=train_response, + test_triplet=test_triplets, + test_response=test_response, + material_name=material_names, + DESCR=fdescr) diff --git a/cblearn/datasets/_musician_similarity.py b/cblearn/datasets/_musician_similarity.py index d7a1b05..edd3ae5 100644 --- a/cblearn/datasets/_musician_similarity.py +++ b/cblearn/datasets/_musician_similarity.py @@ -1,9 +1,9 @@ +import csv from pathlib import Path import logging import joblib import os from typing import Optional, Union -import warnings import numpy as np from sklearn.datasets import _base @@ -24,7 +24,7 @@ def fetch_musician_similarity(data_home: Optional[os.PathLike] = None, download_ """ Load the MusicSeer musician similarity dataset (triplets). 
=================== ===================== - Triplets 213629 + Triplets 224792 Objects (Musicians) 413 Dimensionality unknown =================== ===================== @@ -79,12 +79,10 @@ def fetch_musician_similarity(data_home: Optional[os.PathLike] = None, download_ logger.info('Downloading musician similarity from {} to {}'.format(ARCHIVE.url, data_home)) archive_path = _base._fetch_remote(ARCHIVE, dirname=data_home) - data_dtype = {'names': ('judgement', 'survey', 'user', 'target', 'chosen', 'other'), - 'formats': (' Union[Bunch, np.ndarray]: + """ Load the nature scene similarity dataset (odd-one-out). + + =================== ===================== + Triplets 3355 + Objects (Scenes) 120 + =================== ===================== + + See :ref:`nature_vogue_dataset` for a detailed description. + + >>> dataset = fetch_nature_scene_similarity(shuffle=True) # doctest: +REMOTE_DATA + >>> dataset.image_label[[0, -1]].tolist() # doctest: +REMOTE_DATA + ['art114.jpg', 'n344019.jpg'] + >>> dataset.triplet.shape # doctest: +REMOTE_DATA + (3355, 3) + + Args: + data_home : optional, default: None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + download_if_missing : optional, default=True + shuffle: default = True + Shuffle the order of triplet constraints. + random_state: optional, default = None + Initialization for shuffle random generator + return_triplets : boolean, default=False. + If True, returns numpy array instead of a Bunch object. + + Returns: + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + triplet : ndarray, shape (n_triplets, 3) + Each row corresponding odd-one-out query. + The columns represent the odd image and two others. + class_label : ndarray, shape (120, ) + Names of the scene images. + DESCR : string + Description of the dataset. 
+ triplets : numpy arrays (n_triplets, 3) + Only present when `return_triplets=True`. + + Raises: + IOError: If the data is not locally available, but download_if_missing=False + """ + return _fetch_nature_vogue('nature', data_home, download_if_missing, shuffle, random_state, return_triplets) + + +def fetch_vogue_cover_similarity(data_home: Optional[os.PathLike] = None, download_if_missing: bool = True, + shuffle: bool = True, random_state: Optional[np.random.RandomState] = None, + return_triplets: bool = False) -> Union[Bunch, np.ndarray]: + """ Load the vogue cover similarity dataset (odd-one-out). + + =================== ===================== + Triplets 1107 + Objects (Covers) 60 + =================== ===================== + + See :ref:`nature_vogue_dataset` for a detailed description. + + >>> dataset = fetch_vogue_cover_similarity(shuffle=True) # doctest: +REMOTE_DATA + >>> dataset.image_label[[0, -1]].tolist() # doctest: +REMOTE_DATA + ['Cover_uk_VOgue_MAY10_V_29mar10_bt_268x353.jpg', 'voguecoverapr11_bt_268x353.jpg'] + >>> dataset.triplet.shape # doctest: +REMOTE_DATA + (1107, 3) + + Args: + data_home : optional, default: None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + download_if_missing : optional, default=True + shuffle: default = True + Shuffle the order of triplet constraints. + random_state: optional, default = None + Initialization for shuffle random generator + return_triplets : boolean, default=False. + If True, returns numpy array instead of a Bunch object. + + Returns: + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + triplet : ndarray, shape (n_triplets, 3) + Each row corresponds to an odd-one-out query. + The columns represent the odd image and two others. + image_label : ndarray, shape (60, ) + Names of the cover images. + DESCR : string + Description of the dataset. 
+ triplets : numpy arrays (n_triplets, 3) + Only present when `return_triplets=True`. + + Raises: + IOError: If the data is not locally available, but download_if_missing=False + """ + return _fetch_nature_vogue('vogue', data_home, download_if_missing, shuffle, random_state, return_triplets) + + +def _fetch_nature_vogue(dataset: str, data_home: Optional[os.PathLike] = None, download_if_missing: bool = True, + shuffle: bool = True, random_state: Optional[np.random.RandomState] = None, + return_triplets: bool = False) -> Union[Bunch, np.ndarray]: + + data_home = Path(_base.get_data_home(data_home=data_home)) + if not data_home.exists(): + data_home.mkdir() + + filepath = Path(_base._pkl_filepath(data_home, 'nature_vogue_similarity.pkz')) + if not filepath.exists(): + if not download_if_missing: + raise IOError("Data not found and `download_if_missing` is False") + + logger.info('Downloading material similarity data from {} to {}'.format(ARCHIVE.url, data_home)) + + archive_path = _base._fetch_remote(ARCHIVE, dirname=data_home) + with zipfile.ZipFile(archive_path) as zf: + with zf.open('nature/nature_triplets.txt', 'r') as f: + nature_data = np.loadtxt(f, dtype=str) + + with zf.open('vogue/vogue_triplets.txt', 'r') as f: + vogue_data = np.loadtxt(f, dtype=str) + + joblib.dump((nature_data, vogue_data), filepath, compress=6) + os.remove(archive_path) + else: + (nature_data, vogue_data) = joblib.load(filepath) + + if dataset == 'nature': + data = nature_data[:, [2, 0, 1]] + elif dataset == 'vogue': + data = vogue_data[:, [2, 0, 1]] + + triplets, encoder = query_from_columns(data, [0, 1, 2], return_transformer=True) + classes = encoder.encoder_.classes_ + + if shuffle: + random_state = check_random_state(random_state) + triplets = random_state.permutation(triplets) + + if return_triplets: + return triplets + + module_path = Path(__file__).parent + with module_path.joinpath('descr', 'nature_vogue_similarity.rst').open() as rst_file: + fdescr = rst_file.read() + + return 
Bunch(triplet=triplets, + image_label=classes, + DESCR=fdescr) diff --git a/cblearn/datasets/_things_similarity.py b/cblearn/datasets/_things_similarity.py new file mode 100644 index 0000000..dec438b --- /dev/null +++ b/cblearn/datasets/_things_similarity.py @@ -0,0 +1,121 @@ +import csv +import io +from pathlib import Path +import logging +import joblib +import os +from os.path import join +from typing import Optional, Union +from urllib.request import urlretrieve +import zipfile + +import numpy as np +from sklearn.datasets import _base +from sklearn.utils import check_random_state, Bunch + +ARCHIVE = _base.RemoteFileMetadata( + filename='osfstorage-archive.zip', + url='https://files.osf.io/v1/resources/z2784/providers/osfstorage/?zip=', + checksum=('cannot check - zip involves randomness')) + +logger = logging.getLogger(__name__) + + +def fetch_things_similarity(data_home: Optional[os.PathLike] = None, download_if_missing: bool = True, + shuffle: bool = True, random_state: Optional[np.random.RandomState] = None, + return_data: bool = False) -> Union[Bunch, np.ndarray]: + """ Load the things similarity dataset (odd-one-out). + + =================== ===================== + Trials 146,012 + Objects (Things) 1,854 + Query 3 images, odd one out + =================== ===================== + + See :ref:`things_similarity_dataset` for a detailed description. + + >>> dataset = fetch_things_similarity(shuffle=True) # doctest: +REMOTE_DATA + >>> dataset.word[[0, -1]].tolist() # doctest: +REMOTE_DATA + ['aardvark', 'zucchini'] + >>> dataset.data.shape # doctest: +REMOTE_DATA + (146012, 3) + + Args: + data_home : optional, default: None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + download_if_missing : optional, default=True + shuffle: default = True + Shuffle the order of triplet constraints. 
+ random_state: optional, default = None + Initialization for shuffle random generator + return_triplets : boolean, default=False. + If True, returns numpy array instead of a Bunch object. + + Returns: + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray, shape (n_query, 3) + Each row corresponding a odd-one-out query, entries are object indices. + The first column is the selected odd-one. + word : (n_objects,) + Single word associated with the thing objects. + synset : (n_objects,) + Wordnet Synset associated with the thing objects. + wordnet_id : (n_objects,) + Wordnet Id associated with the thing objects. + thing_id : (n_objects,) + Unique Id string associated with the thing objects. + DESCR : string + Description of the dataset. + data : numpy arrays (n_query, 3) + Only present when `return_data=True`. + + Raises: + IOError: If the data is not locally available, but download_if_missing=False + """ + data_home = Path(_base.get_data_home(data_home=data_home)) + if not data_home.exists(): + data_home.mkdir() + + filepath = Path(_base._pkl_filepath(data_home, 'things_similarity.pkz')) + if not filepath.exists(): + if not download_if_missing: + raise IOError("Data not found and `download_if_missing` is False") + + logger.info('Downloading imagenet similarity data from {} to {}'.format(ARCHIVE.url, data_home)) + + archive_path = (ARCHIVE.filename if data_home is None + else join(data_home, ARCHIVE.filename)) + urlretrieve(ARCHIVE.url, archive_path) + + with zipfile.ZipFile(archive_path) as zf: + with zf.open('data/data1854_batch5_test10.txt', 'r') as f: + data = np.loadtxt(f, delimiter=' ') + + with zf.open('items1854names.tsv', 'r') as f: + objects = np.array(list(csv.reader(io.TextIOWrapper(f), dialect='excel-tab'))[1:]).T + + joblib.dump((data, objects), filepath, compress=6) + os.remove(archive_path) + else: + (data, objects) = joblib.load(filepath) + + if shuffle: + random_state = 
check_random_state(random_state) + data = random_state.permutation(data) + + if return_data: + return data + + module_path = Path(__file__).parent + with module_path.joinpath('descr', 'things_similarity.rst').open() as rst_file: + fdescr = rst_file.read() + + return Bunch(data=data, + word=objects[0], + synset=objects[1], + wordnet_id=objects[2], + thing_id=objects[5], + DESCR=fdescr) diff --git a/cblearn/datasets/descr/car_similarity.rst b/cblearn/datasets/descr/car_similarity.rst new file mode 100644 index 0000000..84c2685 --- /dev/null +++ b/cblearn/datasets/descr/car_similarity.rst @@ -0,0 +1,33 @@ +.. _central_car_dataset: + +Car Similarity dataset +----------------------- + +This dataset contains triplets of 60 car images, collected in an online survey. +The participants chose one car of three, such that the following statement is true: +"Object A is the most central object within the triple of objects (A,B,C)". + +All images were found on Wikimedia Commons and are assigned to one of four classes: +ORDINARY CARS, SPORTS CARS, OFF-ROAD/SPORT UTILITY VEHICLES, and OUTLIERS. + +The corresponding car images are available with the _`full dataset`. +.. _full dataset: http://www.tml.cs.uni-tuebingen.de/team/luxburg/code_and_data/index.php + +**Data Set Characteristics:** + + =================== ===================== + Triplets 7097 + Objects (Cars) 60 + Query 3 cars, most-central + =================== ===================== + +This dataset can be downloaded using the :func:`cblearn.datasets.fetch_car_similarity`. +To use the most-central triplets with e.g. ordinal embedding algorithms, you should convert them to standard triplets +(:func:`cblearn.preprocessing.triplets_from_mostcentral`). + +Please cite the following paper if you use this dataset in publications. + +.. topic:: References + + - M. Kleindessner and U. von Luxburg. Lens depth function and k-relative neighborhood graph: + Versatile tools for ordinal data analysis. JMLR, 18(58):1–52, 2017.
diff --git a/cblearn/datasets/descr/imagenet_similarity.rst b/cblearn/datasets/descr/imagenet_similarity.rst new file mode 100644 index 0000000..da8194c --- /dev/null +++ b/cblearn/datasets/descr/imagenet_similarity.rst @@ -0,0 +1,35 @@ +.. _imagenet_similarity_dataset: + +Imagenet Similarity dataset +--------------------------- + +This dataset contains comparison trials of images from the ImageNet validation dataset (ILSVRC-2012). +In a crowd-sourced experiment, subjects ranked the two out of 8 images that appeared most similar to a reference image. +The trials were selected in an active learning routine, such that the images within a trial are already not too dissimilar. + +There are two versions of this dataset: Version "0.2" has trials for +all 50 ImageNet validation images per class, version "0.1" has trials for a single image per class. + +The whole `dataset`_ is published under CC-By Attribution 4.0 International by Brett Roads. + +.. _dataset: https://osf.io/cn2s3/ + +**Data Set Characteristics:** + + =================== ===================== + Trials v0.1/v0.2 25,273 / 384,277 + Objects (Images) 1,000 / 50,000 + Classes 1,000 + Query rank 2 from 8 + =================== ===================== + +This dataset can be downloaded using the :func:`cblearn.datasets.fetch_imagenet_similarity`. +To use the 8-rank-2 trials with e.g. ordinal embedding algorithms, they can be converted to standard triplets +with :func:`cblearn.preprocessing.triplets_from_multiselect`. + +Please cite the following paper if you use this dataset in publications. + +.. topic:: References + + - Roads, B. D., & Love, B. C. (2020). Enriching ImageNet with Human Similarity Judgments + and Psychological Embeddings. ArXiv:2011.11015 [Cs].
http://arxiv.org/abs/2011.11015 diff --git a/cblearn/datasets/descr/material_similarity.rst b/cblearn/datasets/descr/material_similarity.rst new file mode 100644 index 0000000..a90c309 --- /dev/null +++ b/cblearn/datasets/descr/material_similarity.rst @@ -0,0 +1,30 @@ +.. _material_similarity_dataset: + +Material Similarity dataset +--------------------------- + +This dataset contains triplets of 100 material images, gathered in a crowd-sourced experiment. +For each triplet of one reference and two candidate images, the subjects answered the question +"Which of these two candidates has a more similar appearance to the reference?". +The trials were actively chosen such that they maximize the information gain (CKL algorithm). + +Experimental code and the material images are available at the dataset author's _`Github repository`. +.. _Github repository: https://github.com/mlagunas/material-appearance-similarity + +**Data Set Characteristics:** + + =================== ===================== + Triplets Train/Test 22801 / 3000 + Responses 92892 / 11800 + Objects (Materials) 100 + =================== ===================== + +This dataset can be downloaded using the :func:`cblearn.datasets.fetch_material_similarity`. +Most triplets were answered multiple times, often with contradictory responses. + +Please cite the following paper if you use this dataset in publications. + +.. topic:: References + + - Lagunas, M., Malpica, S., Serrano, A., Garces, E., Gutierrez, D., & Masia, B. (2019). + A Similarity Measure for Material Appearance. ACM Transactions on Graphics, 38(4), 1–12.
diff --git a/cblearn/datasets/descr/musician_similarity.rst b/cblearn/datasets/descr/musician_similarity.rst index ac2738f..461fa9e 100644 --- a/cblearn/datasets/descr/musician_similarity.rst +++ b/cblearn/datasets/descr/musician_similarity.rst @@ -15,7 +15,7 @@ Such, for each user judgement multiple triplets were created with the remaining **Data Set Characteristics:** =================== ===================== - Triplets 213629 + Triplets 224792 Objects (Musicians) 413 Dimensionality unknown =================== ===================== diff --git a/cblearn/datasets/descr/nature_vogue_similarity.rst b/cblearn/datasets/descr/nature_vogue_similarity.rst new file mode 100644 index 0000000..ac5e024 --- /dev/null +++ b/cblearn/datasets/descr/nature_vogue_similarity.rst @@ -0,0 +1,32 @@ +.. _nature_vogue_dataset: + +Nature and Vogue datasets +--------------------------- + +The nature and vogue datasets consist of odd-one-out triplets of the form +"Out of three shown items pick one that appears to be +different from the two others". + +The items were either images of natural scenes (forests, beaches, mountains, etc.) +or covers of the Vogue magazine. + +**Data Set Characteristics:** + + =================== ===================== + Triplets (Covers) 1107 + Objects (Covers) 60 + Triplets (Scenes) 3355 + Objects (Scenes) 120 + =================== ===================== + +These datasets can be downloaded using :func:`cblearn.datasets.fetch_nature_scene_similarity` and +:func:`cblearn.datasets.fetch_vogue_cover_similarity`. +To use the odd-one-out triplets with e.g. ordinal embedding algorithms, convert them to standard triplets +with :func:`cblearn.preprocessing.triplets_from_oddoneout`. + +Please cite the following paper if you use this dataset in publications. + +.. topic:: References + + - Heikinheimo, H., & Ukkonen, A. (2013). The crowd-median algorithm. + In Proceedings of the AAAI Conference on Human Computation and Crowdsourcing (Vol. 1, No. 1).
diff --git a/cblearn/datasets/descr/things_similarity.rst b/cblearn/datasets/descr/things_similarity.rst new file mode 100644 index 0000000..fe78459 --- /dev/null +++ b/cblearn/datasets/descr/things_similarity.rst @@ -0,0 +1,33 @@ +.. _things_similarity_dataset: + +Things Similarity dataset +------------------------- + +`This dataset`_ contains odd-one-out trials of images from the Things image database. +In a crowd-sourced experiment, subjects were asked to choose the one of three images that is the odd one out. +Note: The trials used here are the test trials of the original paper. The training trials are not published. + +The data is shared under CC-BY-4.0 by Hebart, M. N., Zheng, C. Y., Pereira, F., and Baker, C. I. + +.. _This dataset: https://osf.io/z2784/ + + +**Data Set Characteristics:** + + =================== ===================== + Trials 146,012 + Objects (Things) 1,854 + Query 3 images, odd one out + =================== ===================== + +This dataset can be downloaded using the :func:`cblearn.datasets.fetch_things_similarity`. +To use the odd-one-out trials with e.g. ordinal embedding algorithms, they can be converted to standard triplets +with :func:`cblearn.preprocessing.triplets_from_oddoneout`. + +Please cite the following paper if you use this dataset in publications. + +.. topic:: References + + - Hebart, M. N., Zheng, C. Y., Pereira, F., & Baker, C. I. (2020). + Revealing the multidimensional mental representations of natural objects underlying human similarity judgements. + Nature Human Behaviour, 4(11), 1173–1185.
https://doi.org/10.1038/s41562-020-00951-3 diff --git a/cblearn/datasets/tests/test_food_similarity.py b/cblearn/datasets/tests/test_food_similarity.py index c3e4fa6..c02325a 100644 --- a/cblearn/datasets/tests/test_food_similarity.py +++ b/cblearn/datasets/tests/test_food_similarity.py @@ -4,7 +4,7 @@ from cblearn.datasets import fetch_food_similarity -@pytest.mark.download +@pytest.mark.remote_data def test_fetch_food(tmp_path): data_home = tmp_path / 'cblearn_datasets' bunch = fetch_food_similarity(data_home=data_home, shuffle=False) diff --git a/cblearn/datasets/tests/test_musician_similarity.py b/cblearn/datasets/tests/test_musician_similarity.py index b0e3bbc..812b5bf 100644 --- a/cblearn/datasets/tests/test_musician_similarity.py +++ b/cblearn/datasets/tests/test_musician_similarity.py @@ -4,15 +4,15 @@ from cblearn.datasets import fetch_musician_similarity -@pytest.mark.download +@pytest.mark.remote_data def test_fetch_musician_similarity(tmp_path): data_home = tmp_path / 'cblearn_datasets' bunch = fetch_musician_similarity(data_home=data_home, shuffle=False) - assert bunch.data.shape == (213629, 3) - assert bunch.judgement_id.shape == (213629, ) - assert bunch.user.shape == (213629, ) - assert bunch.survey_or_game.shape == (213629, ) + assert bunch.data.shape == (224792, 3) + assert bunch.judgement_id.shape == (224792, ) + assert bunch.user.shape == (224792, ) + assert bunch.survey_or_game.shape == (224792, ) assert bunch.artists.shape == (413, ) assert bunch.artists[bunch.data[0, 0]] == 'queen' diff --git a/cblearn/embedding/wrapper/_mlds.py b/cblearn/embedding/wrapper/_mlds.py index 6a90f6d..0d9c07f 100644 --- a/cblearn/embedding/wrapper/_mlds.py +++ b/cblearn/embedding/wrapper/_mlds.py @@ -9,6 +9,9 @@ from cblearn.embedding.wrapper._r_base import RWrapperMixin +__doctest_requires__ = {'MLDS': ['rpy2']} + + class MLDS(BaseEstimator, TripletEmbeddingMixin, RWrapperMixin): """ A maximum-likelihood difference scaling (MLDS) estimator, wrapping the R 
implementation. diff --git a/cblearn/embedding/wrapper/_soe.py b/cblearn/embedding/wrapper/_soe.py index 8d4fc72..674db90 100644 --- a/cblearn/embedding/wrapper/_soe.py +++ b/cblearn/embedding/wrapper/_soe.py @@ -9,6 +9,9 @@ from cblearn.embedding.wrapper._r_base import RWrapperMixin +__doctest_requires__ = {'SOE': ['rpy2']} + + class SOE(BaseEstimator, TripletEmbeddingMixin, RWrapperMixin): """ A soft ordinal embedding estimator, wrapping an R implementation. diff --git a/cblearn/preprocessing/__init__.py b/cblearn/preprocessing/__init__.py new file mode 100644 index 0000000..239af5d --- /dev/null +++ b/cblearn/preprocessing/__init__.py @@ -0,0 +1,5 @@ +from cblearn.preprocessing._query import triplets_from_multiselect +from cblearn.preprocessing._query import triplets_from_oddoneout +from cblearn.preprocessing._query import triplets_from_mostcentral + +from cblearn.preprocessing._label import query_from_columns \ No newline at end of file diff --git a/cblearn/preprocessing/_label.py b/cblearn/preprocessing/_label.py new file mode 100644 index 0000000..ccd5ae5 --- /dev/null +++ b/cblearn/preprocessing/_label.py @@ -0,0 +1,187 @@ +from typing import Dict, Union, List, Tuple, Optional + +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.preprocessing import LabelEncoder, FunctionTransformer +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted +import numpy as np + + +def _unique_firstaxis(X, return_inverse: bool = False): + """ Find sorted, unique array elements. + + Has a (slow) fallback, if the numpy array is mixed typed and + cannot be used with numpy's unique method. 
+ + >>> _unique_firstaxis([[0.1, 'high'], [0.3, 'low'], [0.1, 'high'], [0.4, 'low']]).tolist() + [['0.1', 'high'], ['0.3', 'low'], ['0.4', 'low']] + >>> u, i = _unique_firstaxis([[0.1, 'high'], [0.3, 'low'], [0.1, 'high'], [0.4, 'low']], return_inverse=True) + >>> u.tolist(), i.tolist() + ([['0.1', 'high'], ['0.3', 'low'], ['0.4', 'low']], [0, 1, 0, 2]) + """ + X = check_array(X, dtype=None, ensure_2d=True) + if X.dtype == object: + py_X = tuple(map(tuple, X)) + uniques = sorted(set(py_X)) + if return_inverse: + ix_map = {val: ix for ix, val in enumerate(uniques)} + indices = [ix_map[val] for val in py_X] + return np.array(uniques, dtype=X.dtype), np.array(indices) + else: + return uniques + else: + return np.unique(X, axis=0, return_inverse=return_inverse) + + +class MultiColumnLabelEncoder(LabelEncoder): + """ Encoder for objects that are a combination of labels in multiple columns. + + Extends the function of scikit-learn's label encoder to 2d arrays. + See :class:`sklearn.preprocessing.LabelEncoder` for more information. + + >>> encoder = MultiColumnLabelEncoder() + >>> label_data = [[0.1, 'high'], [0.3, 'low'], [0.1, 'high'], [0.1, 'low']] + >>> encoder.fit(label_data).transform(label_data).tolist() + [0, 2, 0, 1] + >>> encoder.fit_transform(label_data).tolist() + [0, 2, 0, 1] + >>> encoder.inverse_transform([2, 1, 0]).tolist() + [['0.3', 'low'], ['0.1', 'low'], ['0.1', 'high']] + """ + def fit(self, X): + self.classes_ = _unique_firstaxis(X) + return self + + def fit_transform(self, X): + self.classes_, indices = _unique_firstaxis(X, return_inverse=True) + return indices + + def transform(self, X): + # This method is a modified copy of scikit-learn's implementation + # of sklearn.preprocessing.LabelEncoder.transform (3-clause BSC licensed). 
+ check_is_fitted(self) + X = check_array(X, dtype=None) + + if len(X) == 0: + return np.array([]) + + ix = np.empty(len(X), dtype=int) + for i, c in enumerate(self.classes_): + c_ix = np.where((X == c).all(axis=1))[0] + ix[c_ix] = i + return ix + + def inverse_transform(self, X): + return LabelEncoder.inverse_transform(self, X) + + +class SharedColumnEncoder(TransformerMixin, BaseEstimator): + """ Wrapper to share an encoder across all columns. + + >>> encoder = SharedColumnEncoder(LabelEncoder()) + >>> label_data = [[0.1, 0.3, 0.4], [0.4, 0.1, 0.3], [0.5, 0.3, 0.3]] + >>> encoder.fit(label_data).transform(label_data).tolist() + [[0, 1, 2], [2, 0, 1], [3, 1, 1]] + >>> encoder.fit_transform(label_data).tolist() + [[0, 1, 2], [2, 0, 1], [3, 1, 1]] + >>> encoder.inverse_transform([[2, 2], [1, 0], [0, 1]]).tolist() + [[0.4, 0.4], [0.3, 0.1], [0.1, 0.3]] + """ + def __init__(self, encoder): + self.encoder_ = encoder + + def fit(self, X): + X = check_array(X, allow_nd=True, dtype=None) + self.encoder_.fit(X.reshape(-1, *X.shape[2:])) + return self + + def fit_transform(self, X): + X = check_array(X, allow_nd=True, dtype=None) + long_X = self.encoder_.fit_transform(X.reshape(-1, *X.shape[2:])) + return long_X.reshape(X.shape[:2]) + + def transform(self, X): + X = check_array(X, allow_nd=True, dtype=None) + return self.encoder_.transform(X.reshape(-1, *X.shape[2:])).reshape(X.shape[:2]) + + def inverse_transform(self, X): + X = check_array(X, allow_nd=True, dtype=None) + return self.encoder_.inverse_transform(X.reshape(-1, *X.shape[2:])).reshape(X.shape[0], -1) + + def _more_tags(self): + return {} + + +def query_from_columns(data: Union[np.ndarray, "pandas.DataFrame"], # noqa: F821 ignore pandas, not a library dep + query_columns: Union[List[str], List[int]], + response_columns: Optional[Union[List[str], List[int], str, int]] = None, + response_map: Optional[Dict[str, Union[bool, int]]] = None, + return_transformer: bool = False) \ + -> Union[Tuple[np.ndarray, 
np.ndarray], + Tuple[Tuple[np.ndarray, np.ndarray], Tuple[TransformerMixin, TransformerMixin]]]: + """ Extract queries from objects in columns or dataframes. + + The objects in the column data might be defined by a single or multiple numerical attributes. + Then this function assigns to each object an index and returns query and response based on object indices, + as required by most library functions. + If attributes are non-numeric, consider first encoding them with :class:`sklearn.preprocessing.LabelEncoder`. + + >>> import pandas as pd + >>> frame = pd.DataFrame({'alpha1': [0.1, 0.7, 0.1], 'tau1': [0, 0, 1], + ... 'alpha2': [0.3, 0.3, 0.7], 'tau2': [1, 0, 0], + ... 'alpha3': [0.7, 0.3, 0.7], 'tau3': [0, 1, 0], 'Response': [1, 0, 0]}) + >>> q, r = query_from_columns(frame, ['alpha1', 'alpha2', 'alpha3'], 'Response', response_map={1: True, 0: False}) + >>> q.tolist(), r.tolist() + ([[0, 1, 2], [2, 1, 1], [0, 2, 2]], [True, False, False]) + >>> q, r = query_from_columns(np.array(frame), [0, 2, 4], response_columns=-1, response_map={1: True, 0: False}) + >>> q.tolist(), r.tolist() + ([[0, 1, 2], [2, 1, 1], [0, 2, 2]], [True, False, False]) + >>> q, r = query_from_columns(frame, [('alpha1', 'tau1'), ('alpha2', 'tau2'), ('alpha3', 'tau3')], + ... response_columns='Response', response_map={1: True, 0: False}) + >>> q.tolist(), r.tolist() + ([[0, 3, 4], [4, 2, 3], [1, 4, 4]], [True, False, False]) + + The transformers can be used to get object attributes from the object index. + + >>> (q,r), (q_transform, r_transform) = query_from_columns( + ... np.array(frame), [0, 2, 4], -1, {1: True, 0: False}, return_transformer=True) + >>> q_transform.inverse_transform(q).tolist() + [[0.1, 0.3, 0.7], [0.7, 0.3, 0.3], [0.1, 0.7, 0.7]] + + + Args: + data: Tabular query representation (n_queries, n_columns) + query_columns: Indices or column-labels in data per query entry. + Columns can be grouped as tuples, if multiple columns define an object. 
+ response_columns: Indices or column-labels in data per response entry. + response_map: Dictionary mapping the response entries in data to {-1, 1} or {False, True}. + return_transformer: If true, transformer objects for the query and response are returned. + Returns: + Tuple with arrays for the queries and responses. + + If return_transform=True, an additional tuple with transformer objects is returned. + + """ + if not hasattr(data, 'columns'): # is no pandas Dataframe? + data = check_array(data, dtype=None).T + query_data = np.swapaxes(np.stack([data[np.array(c)] for c in query_columns]), 0, 1) + if len(query_data.shape) == 3: + query_enc = SharedColumnEncoder(MultiColumnLabelEncoder()) + else: + query_enc = SharedColumnEncoder(LabelEncoder()) + query = query_enc.fit_transform(query_data) + + if response_columns: + inverse_map = {v: k for k, v in response_map.items()} + response_enc = FunctionTransformer(func=np.vectorize(response_map.get), + inverse_func=np.vectorize(inverse_map.get), check_inverse=False) + response = response_enc.fit_transform(data[response_columns]) + if return_transformer: + return (query, response), (query_enc, response_enc) + else: + return query, response + else: + if return_transformer: + return query, query_enc + else: + return query diff --git a/cblearn/preprocessing/_query.py b/cblearn/preprocessing/_query.py new file mode 100644 index 0000000..4512cec --- /dev/null +++ b/cblearn/preprocessing/_query.py @@ -0,0 +1,100 @@ +from itertools import permutations +from typing import Union, Optional + +import numpy as np +from sklearn.utils import check_array, check_X_y + + +def triplets_from_multiselect(X: np.ndarray, select: Union[np.ndarray, int], is_ranked: bool) -> np.ndarray: + """ Calculate triplets from n-select or n-rank queries. + + The n-select query consists of :math:`k>2` object indices. + The first index indicates the pivot object. The selected objects + should be more similar to the pivot, than the other objects. 
+ The selected objects themself can be ordered in similarity to the pivot (`is_ranked=True`). + + Triplets are a special case of n-select queries, with a single other object. + + Args: + X: n-select or n-rank query as array (n_query, n_choices) + select: Integer of first n selected columns or a 2d array (n_query, n_select) + of column indices in X (0..n_choices). + is_ranked: If true, assumes that the selected objects are ordered by their similarity. + Return: + triplets: Array of triplet queries + (n_query * (2 * (n_choices - n_select - 1) + int(is_ranked)), 3) + """ + if isinstance(select, int): + X = check_array(X) + n_select = select + else: + unordered_X, y = check_X_y(X, select, multi_output=True) + n_select = y.shape[1] + all_rows = np.arange(unordered_X.shape[0]) + other_mask = np.ones_like(unordered_X, dtype=bool) + other_mask[:, 0] = False # pivot column + X = np.array(unordered_X) + for col, selected_col in enumerate(y.T): + other_mask[all_rows, selected_col] = False + X[all_rows, col + 1] = unordered_X[all_rows, selected_col] + X[all_rows, (n_select + 1):] = unordered_X[other_mask].reshape(X.shape[0], -1) + + n_trials, n_stimuli = X.shape + ix_array = np.array([ + [0, ix_select, ix_other] + for ix_select in range(1, n_select + 1) + for ix_other in range(ix_select + 1 if is_ranked else n_select + 1, n_stimuli) + ]) + return np.concatenate(X[:, ix_array]) + + +def triplets_from_oddoneout(X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray: + """ Calculates triplets from odd-one-out queries. + + The odd-one-out query consists of k objects, of which one + is most dissimilar to all others. + + .. Note:: + For this transformation, we are assuming that the objects respect the triangle inequality. + This might not always be a given and is not checked by this function. + + Args: + X: Array of odd-one-out queries (n_query, n_choices) + y: Optional list of indices, that indicate the odd choice per query. 
+ If omitted, the first entry is assumed to be the odd object. + Returns: + triplets: Array of triplet queries (n_query * (n_choices - 2) * (n_choices - 1), 3) + """ + if y is None: + y = np.zeros(len(X), dtype=int) + + X, y = check_X_y(X, y) + mask = np.zeros_like(X, dtype=bool) + mask[np.arange(X.shape[0]), y] = True + far = X[mask] + others = X[~mask].reshape(X.shape[0], X.shape[1] - 1) + triplets = [] + for other_ix in permutations(np.arange(others.shape[1]), 2): + triplets.append(np.c_[others[:, other_ix], far]) + return np.row_stack(triplets) + + +def triplets_from_mostcentral(X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray: + """ Calculates triplets from most-central queries. + + The most-central query consists of k objects, of which one + is most similar to all others. + + .. Note:: + For this transformation, we are assuming that the objects respect the triangle inequality. + This might not always be a given and is not checked by this function. + + Args: + X: Array of most-central queries (n_query, n_choices) + y: Optional list of indices, that indicate the central choice per query. + If omitted, the first entry is assumed to be the odd object. 
+ Returns: + triplets: Array of triplet queries (n_query * (n_choices - 2) * (n_choices - 1), 3) + """ + triplets = triplets_from_oddoneout(X, y) + return triplets[:, [0, 2, 1]] diff --git a/cblearn/preprocessing/tests/__init__.py b/cblearn/preprocessing/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cblearn/preprocessing/tests/test_query.py b/cblearn/preprocessing/tests/test_query.py new file mode 100644 index 0000000..42b66f4 --- /dev/null +++ b/cblearn/preprocessing/tests/test_query.py @@ -0,0 +1,73 @@ +import numpy as np + +from cblearn.preprocessing import triplets_from_multiselect +from cblearn.preprocessing import triplets_from_oddoneout +from cblearn.preprocessing import triplets_from_mostcentral + + +def test_triplets_from_multiselect(): + multi_data = np.asarray([[0, 1, 2, 3, 4, 5, 6], + [10, 11, 12, 13, 14, 15, 16]]) + n_select = 2 + selected = np.asarray([[2, 1], [1, 2]]) + + # Use column data to mark selection + triplets = triplets_from_multiselect(multi_data, n_select, is_ranked=False) + assert triplets.shape == (len(multi_data) * (4 + 4), 3) + assert np.isin(triplets[:, 1], [1, 2, 11, 12]).all() + assert np.isin(triplets[:, 2], multi_data[:, 3:]).all() + + triplets = triplets_from_multiselect(multi_data, n_select, is_ranked=True) + assert triplets.shape == (len(multi_data) * (5 + 4), 3) + assert np.isin(triplets[:, 1], [1, 2, 11, 12]).all() + assert np.isin(triplets[:, 2], multi_data[:, 2:]).all() + + # Use multiple labels to mark selection + triplets = triplets_from_multiselect(multi_data, selected, is_ranked=False) + assert triplets.shape == (len(multi_data) * (4 + 4), 3) + assert np.isin(triplets[:, 1], [1, 2, 11, 12]).all() + assert np.isin(triplets[:, 2], multi_data[:, 3:]).all() + + triplets = triplets_from_multiselect(multi_data, selected, is_ranked=True) + print(triplets) + assert triplets.shape == (len(multi_data) * (5 + 4), 3) + assert np.isin(triplets[:, 1], [1, 2, 11, 12]).all() + assert np.isin(triplets[:, 2], 
[1, 3, 4, 5, 6, 12, 13, 14, 15, 16]).all() + + +def test_triplets_from_oddoneout(): + oddone_data = np.asarray([[0, 1, 2, 3], + [5, 6, 7, 8]]) + selected = np.asarray([0, 2]) + n_triplets = len(oddone_data) * (4 - 1) * (4 - 2) + + triplets = triplets_from_oddoneout(oddone_data) # select first column + assert triplets.shape == (n_triplets, 3) + np.testing.assert_equal(np.unique(triplets[:, 0]), [1, 2, 3, 6, 7, 8]) + np.testing.assert_equal(np.unique(triplets[:, 1]), [1, 2, 3, 6, 7, 8]) + np.testing.assert_equal(np.unique(triplets[:, 2]), [0, 5]) + + triplets = triplets_from_oddoneout(oddone_data, selected) + assert triplets.shape == (n_triplets, 3) + np.testing.assert_equal(np.unique(triplets[:, 0]), [1, 2, 3, 5, 6, 8]) + np.testing.assert_equal(np.unique(triplets[:, 1]), [1, 2, 3, 5, 6, 8]) + np.testing.assert_equal(np.unique(triplets[:, 2]), [0, 7]) + + +def test_triplets_from_mostcentral(): + mostcentral_data = np.asarray([[0, 1, 2, 3], + [5, 6, 7, 8]]) + selected = np.asarray([0, 2]) + n_triplets = len(mostcentral_data) * (4 - 1) * (4 - 2) + + triplets = triplets_from_mostcentral(mostcentral_data) # select first column + assert triplets.shape == (n_triplets, 3) + np.testing.assert_equal(np.unique(triplets[:, 0]), [1, 2, 3, 6, 7, 8]) + np.testing.assert_equal(np.unique(triplets[:, 1]), [0, 5]) + np.testing.assert_equal(np.unique(triplets[:, 2]), [1, 2, 3, 6, 7, 8]) + + triplets = triplets_from_mostcentral(mostcentral_data, selected) + assert triplets.shape == (n_triplets, 3) + np.testing.assert_equal(np.unique(triplets[:, 0]), [1, 2, 3, 5, 6, 8]) + np.testing.assert_equal(np.unique(triplets[:, 1]), [0, 7]) + np.testing.assert_equal(np.unique(triplets[:, 2]), [1, 2, 3, 5, 6, 8]) diff --git a/docs/references/index.rst b/docs/references/index.rst index 5aacc85..58698d9 100644 --- a/docs/references/index.rst +++ b/docs/references/index.rst @@ -20,8 +20,14 @@ Loaders .. 
autosummary:: :toctree: generated/ - datasets.fetch_musician_similarity + datasets.fetch_car_similarity datasets.fetch_food_similarity + datasets.fetch_imagenet_similarity + datasets.fetch_nature_scene_similarity + datasets.fetch_material_similarity + datasets.fetch_musician_similarity + datasets.fetch_vogue_cover_similarity + datasets.fetch_things_similarity Simulations @@ -92,6 +98,22 @@ Wrapper metrics.procrustes_distance metrics.TripletScorer +:mod:`cblearn.preprocessing` Preprocessing +========================================== + +.. automodule:: cblearn.preprocessing + +.. currentmodule:: cblearn + +.. autosummary:: + :toctree: generated/ + + preprocessing.query_from_columns + preprocessing.triplets_from_multiselect + preprocessing.triplets_from_oddoneout + preprocessing.triplets_from_mostcentral + + :mod:`cblearn.utils` Utility ============================ diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst index 75b7360..e291bd0 100644 --- a/docs/user_guide/index.rst +++ b/docs/user_guide/index.rst @@ -29,3 +29,8 @@ Dataset loading utilities .. include:: ../../cblearn/datasets/descr/musician_similarity.rst .. include:: ../../cblearn/datasets/descr/food_similarity.rst +.. include:: ../../cblearn/datasets/descr/car_similarity.rst +.. include:: ../../cblearn/datasets/descr/imagenet_similarity.rst +.. include:: ../../cblearn/datasets/descr/things_similarity.rst +.. include:: ../../cblearn/datasets/descr/nature_vogue_similarity.rst +.. 
include:: ../../cblearn/datasets/descr/material_similarity.rst diff --git a/pyproject.toml b/pyproject.toml index 0842354..b14b704 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,12 +2,11 @@ requires = ["setuptools", "wheel"] [tool.pytest.ini_options] -addopts = "--doctest-modules -ra --verbose --capture=no" +addopts = "--doctest-plus -ra --verbose --capture=no" doctest_optionflags = "NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL" testpaths = [ "cblearn", ] markers = [ "sklearn: mark tests for sklearn compatibility.", - "download: mark tests running downloads." ] diff --git a/setup.cfg b/setup.cfg index 6666057..98e6034 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,7 +31,7 @@ packages = find: zip_safe = False install_requires = numpy>=1.19,<2 - scipy>=1.5,<2 + scipy>=1.6,<2 scikit-learn>=0.23,<1 sparse>=0.11,<1 @@ -48,8 +48,11 @@ wrapper = tests = pytest>=6,<7 pytest-cov>=2.10,<3 + pytest-doctestplus>=0.9,<1 + pytest-remotedata>=0.3,<1 flake8>=3.8,<4 mypy>=0.790 + pandas>=1.1,<1.2 docs = sphinx>=3.2,<4 sphinx_rtd_theme>=0.5,<1