From 05b3bc389bb07cdba329f51266e18cd10982614d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David-Elias=20K=C3=BCnstle?= Date: Wed, 12 Jun 2024 13:01:39 +0200 Subject: [PATCH] Remove duplicate references (#81) --- cblearn/datasets/_triplet_response.py | 11 +++++ docs/getting_started/index.rst | 2 +- paper/references.bib | 67 +++++++++++++++------------ 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/cblearn/datasets/_triplet_response.py b/cblearn/datasets/_triplet_response.py index 6d020c4..21e34fa 100644 --- a/cblearn/datasets/_triplet_response.py +++ b/cblearn/datasets/_triplet_response.py @@ -9,6 +9,17 @@ from cblearn.datasets._datatypes import NoiseTarget, Distance +def _count_unique_items(query): + """ Count unique items per row in a 2D array. + + Efficient approach even for large number of rows + and integer items: + https://stackoverflow.com/a/48473125 + """ + sorted_query = np.sort(query, axis=1) + return (sorted_query[:, 1:] != sorted_query[:, :-1]).sum(axis=1) + 1 + + def _count_unique_items(query): """ Count unique items per row in a 2D array. diff --git a/docs/getting_started/index.rst b/docs/getting_started/index.rst index d537e98..262b5c4 100644 --- a/docs/getting_started/index.rst +++ b/docs/getting_started/index.rst @@ -101,4 +101,4 @@ once enough are available. The triplet generator's `result_format` option specifies the expected data format of the triplets, as triplets can be represented in different ways. This example uses the `list-order` format, a list of triplets, containing the indices of an anchor, near, and far point. Learn more about data formats and other aspects of the library in the :ref:`user_guide`. -Alternatively, you can find more code in the :ref:`examples` or get an overview of the :ref:`api`. +Alternatively, you can find more code in the :ref:`examples` or get an overview of the :ref:`api_ref`. diff --git a/paper/references.bib b/paper/references.bib index 536b042..5fa4aa6 100644 --- a/paper/references.bib +++ b/paper/references.bib @@ -24,6 +24,7 @@ @article{fsauerObjectiveMeasurementApproach2024 copyright = {All rights reserved}, langid = {english}, } + @article{huber2024tracing, title={Tracing Truth Through Conceptual Scaling: Mapping People’s Understanding of Abstract Concepts}, author={Huber, Lukas S and K{\"u}nstle, David-Elias and Reuter, Kevin}, @@ -32,8 +33,17 @@ @article{huber2024tracing doi={10.31234/osf.io/c42yr} } - -@article{Sievert2023, doi = {10.21105/joss.04517}, url = {https://doi.org/10.21105/joss.04517}, year = {2023}, publisher = {The Open Journal}, volume = {8}, number = {84}, pages = {4517}, author = {Scott Sievert and Robert Nowak and Timothy Rogers}, title = {Efficiently Learning Relative Similarity Embeddings with Crowdsourcing}, journal = {Journal of Open Source Software} } +@article{Sievert2023, + doi = {10.21105/joss.04517}, + url = {https://doi.org/10.21105/joss.04517}, + year = {2023}, publisher = {The Open Journal}, + volume = {8}, + number = {84}, + pages = {4517}, + author = {Scott Sievert and Robert Nowak and Timothy Rogers}, + title = {Efficiently Learning Relative Similarity Embeddings with Crowdsourcing}, + journal = {Journal of Open Source Software} +} @inproceedings{NIPS2015_89ae0fe2, author = {Jamieson, Kevin G and Jain, Lalit and Fernandez, Chris and Glattard, Nicholas J. and Nowak, Rob}, @@ -47,7 +57,6 @@ @inproceedings{NIPS2015_89ae0fe2 year = {2015} } - @article{vankadara_insights_2020, title = {Insights into {Ordinal} {Embedding} {Algorithms}: {A} {Systematic} {Evaluation}}, shorttitle = {Insights into {Ordinal} {Embedding} {Algorithms}}, @@ -139,9 +148,20 @@ @article{maloney_maximum_2003 } @InProceedings{agarwal_generalized_2007, - title = {Generalized Non-metric Multidimensional Scaling}, - author = {Agarwal, Sameer and Wills, Josh and Cayton, Lawrence and Lanckriet, Gert and Kriegman, David and Belongie, Serge}, booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics}, pages = {11--18}, year = {2007}, editor = {Meila, Marina and Shen, Xiaotong}, volume = {2}, series = {Proceedings of Machine Learning Research}, address = {San Juan, Puerto Rico}, month = {21--24 Mar}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v2/agarwal07a/agarwal07a.pdf}, url = {https://proceedings.mlr.press/v2/agarwal07a.html}} - + title = {Generalized Non-metric Multidimensional Scaling}, + author = {Agarwal, Sameer and Wills, Josh and Cayton, Lawrence and Lanckriet, Gert and Kriegman, David and Belongie, Serge}, + booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics}, + pages = {11--18}, + year = {2007}, + editor = {Meila, Marina and Shen, Xiaotong}, + volume = {2}, + series = {Proceedings of Machine Learning Research}, + address = {San Juan, Puerto Rico}, + month = {21--24 Mar}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v2/agarwal07a/agarwal07a.pdf}, + url = {https://proceedings.mlr.press/v2/agarwal07a.html} +} @inproceedings{terada_local_2014, title = {Local ordinal embedding}, @@ -226,14 +246,7 @@ @inproceedings{kingma2014adam biburl = {https://dblp.org/rec/journals/corr/KingmaB14.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} } -@inproceedings{heikinheimo2013crowd, - title={The crowd-median algorithm}, - author={Heikinheimo, Hannes and Ukkonen, Antti}, - booktitle={Proceedings of the AAAI Conference on Human Computation and Crowdsourcing}, - volume={1}, - pages={69--77}, - year={2013} -} + @article{paszke2019pytorch, title={Pytorch: An imperative style, high-performance deep learning library}, author={Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others}, @@ -241,6 +254,7 @@ @article{paszke2019pytorch volume={32}, year={2019} } + @inproceedings{anselPyTorchFasterMachine2024, title = {{{PyTorch}} 2: {{Faster Machine Learning Through Dynamic Python Bytecode Transformation}} and {{Graph Compilation}}}, shorttitle = {{{PyTorch}} 2}, @@ -256,6 +270,7 @@ @inproceedings{anselPyTorchFasterMachine2024 isbn = {9798400703850}, langid = {english}, } + @article{virtanenSciPyFundamentalAlgorithms2020, title = {{{SciPy}} 1.0: Fundamental Algorithms for Scientific Computing in {{Python}}}, shorttitle = {{{SciPy}} 1.0}, @@ -275,6 +290,7 @@ @article{virtanenSciPyFundamentalAlgorithms2020 langid = {english}, keywords = {Biophysical chemistry,Computational biology and bioinformatics,Technology}, } + @article{harris_array_2020, title = {Array Programming with {{NumPy}}}, author = {Harris, Charles R. and Millman, K. Jarrod and {van der Walt}, St{\'e}fan J. and Gommers, Ralf and Virtanen, Pauli and Cournapeau, David and Wieser, Eric and Taylor, Julian and Berg, Sebastian and Smith, Nathaniel J. and Kern, Robert and Picus, Matti and Hoyer, Stephan and {van Kerkwijk}, Marten H. and Brett, Matthew and Haldane, Allan and {del R{\'i}o}, Jaime Fern{\'a}ndez and Wiebe, Mark and Peterson, Pearu and {G{\'e}rard-Marchant}, Pierre and Sheppard, Kevin and Reddy, Tyler and Weckesser, Warren and Abbasi, Hameer and Gohlke, Christoph and Oliphant, Travis E.}, @@ -289,6 +305,7 @@ @article{harris_array_2020 langid = {english}, keywords = {Computational neuroscience,Computational science,Computer science,Software,Solar physics} } + @article{hebartRevealingMultidimensionalMental2020, title = {Revealing the multidimensional mental representations of natural objects underlying human similarity judgements}, volume = {4}, @@ -379,6 +396,7 @@ @article{buitinck_api_2013 journal={arXiv:1309.0238 [cs.LG]}, doi = {10.48550/arXiv.1309.0238}, } + @inproceedings{ghoshdastidar_foundations_2019, title = {Foundations of {Comparison}-{Based} {Hierarchical} {Clustering}}, abstract = {We address the classical problem of hierarchical clustering, but in a framework where one does not have access to a representation of the objects or their pairwise similarities. Instead, we assume that only a set of comparisons between objects is available, that is, statements of the form objects i and j are more similar than objects k and l.'' Such a scenario is commonly encountered in crowdsourcing applications. The focus of this work is to develop comparison-based hierarchical clustering algorithms that do not rely on the principles of ordinal embedding. We show that single and complete linkage are inherently comparison-based and we develop variants of average linkage. We provide statistical guarantees for the different methods under a planted hierarchical partition model. We also empirically demonstrate the performance of the proposed approaches on several datasets.}, @@ -387,21 +405,6 @@ @inproceedings{ghoshdastidar_foundations_2019 year = {2019}, } -@article{harris_array_2020, - title = {Array programming with {NumPy}}, - volume = {585}, - copyright = {2020 The Author(s)}, - abstract = {Array programming provides a powerful, compact and expressive syntax for accessing, manipulating and operating on data in vectors, matrices and higher-dimensional arrays. NumPy is the primary array programming library for the Python language. It has an essential role in research analysis pipelines in fields as diverse as physics, chemistry, astronomy, geoscience, biology, psychology, materials science, engineering, finance and economics. For example, in astronomy, NumPy was an important part of the software stack used in the discovery of gravitational waves1 and in the first imaging of a black hole2. Here we review how a few fundamental array concepts lead to a simple and powerful programming paradigm for organizing, exploring and analysing scientific data. NumPy is the foundation upon which the scientific Python ecosystem is constructed. It is so pervasive that several projects, targeting audiences with specialized needs, have developed their own NumPy-like interfaces and array objects. Owing to its central position in the ecosystem, NumPy increasingly acts as an interoperability layer between such array computation libraries and, together with its application programming interface (API), provides a flexible framework to support the next decade of scientific and industrial analysis.}, - language = {en}, - number = {7825}, - journal = {Nature}, - author = {Harris, Charles R. and Millman, K. Jarrod and van der Walt, Stéfan J. and Gommers, Ralf and Virtanen, Pauli and Cournapeau, David and Wieser, Eric and Taylor, Julian and Berg, Sebastian and Smith, Nathaniel J. and Kern, Robert and Picus, Matti and Hoyer, Stephan and van Kerkwijk, Marten H. and Brett, Matthew and Haldane, Allan and del Río, Jaime Fernández and Wiebe, Mark and Peterson, Pearu and Gérard-Marchant, Pierre and Sheppard, Kevin and Reddy, Tyler and Weckesser, Warren and Abbasi, Hameer and Gohlke, Christoph and Oliphant, Travis E.}, - year = {2020}, - keywords = {Computational neuroscience, Computational science, Computer science, Software, Solar physics}, - pages = {357--362}, - doi = {10.1038/s41586-020-2649-2}, -} - @inproceedings{perrot_near-optimal_2020, title = {Near-optimal comparison based clustering}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, @@ -409,6 +412,7 @@ @inproceedings{perrot_near-optimal_2020 editor = {Larochelle, H. and Ranzato, M. and Hadsell, R. and Balcan, M.F. and Lin, H.}, year = {2020}, } + @inproceedings{heikinheimo2013crowd, title={The crowd-median algorithm}, author={Heikinheimo, Hannes and Ukkonen, Antti}, @@ -433,6 +437,7 @@ @InProceedings{amid2015 publisher = {PMLR}, abstract = {For humans, it is usually easier to make statements about the similarity of objects in relative, rather than absolute terms. Moreover, subjective comparisons of objects can be based on a number of different and independent attributes. For example, objects can be compared based on their shape, color, etc. In this paper, we consider the problem of uncovering these hidden attributes given a set of relative distance judgments in the form of triplets. The attribute that was used to generate a particular triplet in this set is unknown. Such data occurs, e.g., in crowdsourcing applications where the triplets are collected from a large group of workers. We propose the Multiview Triplet Embedding (MVTE) algorithm that produces a number of low-dimensional maps, each corresponding to one of the hidden attributes. The method can be used to assess how many different attributes were used to create the triplets, as well as to assess the difficulty of a distance comparison task, and find objects that have multiple interpretations in relation to the other objects.} } + @inproceedings{balcan2016learning, title={Learning combinatorial functions from pairwise comparisons}, author={Balcan, Maria-Florina and Vitercik, Ellen and White, Colin}, @@ -441,6 +446,7 @@ @inproceedings{balcan2016learning year={2016}, organization={PMLR} } + @inproceedings{anderton2019scaling, title={Scaling up ordinal embedding: A landmark approach}, author={Anderton, Jesse and Aslam, Javed}, @@ -449,6 +455,7 @@ @inproceedings{anderton2019scaling year={2019}, organization={PMLR} } + @inproceedings{bower2018landscape, title={The landscape of non-convex quadratic feasibility}, author={Bower, Amanda and Jain, Lalit and Balzano, Laura}, @@ -458,6 +465,7 @@ @inproceedings{bower2018landscape organization={IEEE}, doi={10.1109/icassp.2018.8461868} } + @inproceedings{ghosh2019landmark, title = {Landmark Ordinal Embedding}, booktitle = {Advances in Neural Information Processing Systems}, @@ -468,7 +476,6 @@ @inproceedings{ghosh2019landmark publisher = {{Curran Associates, Inc.}} } - % active OE: @article{sievert2023efficiently, title={Efficiently Learning Relative Similarity Embeddings with Crowdsourcing},