From 8823369b91c82d7641367cfb0b9cdfec075143b1 Mon Sep 17 00:00:00 2001 From: Henil Panchal Date: Sun, 16 Jun 2024 17:43:48 +0530 Subject: [PATCH] docs: Update annotations for tasks (#936) * annottations * annotate * annotate * annotate * annotate * ann * ann * ann * ann * ann * formatted * add data * add data * Update docs/mmteb/points/936.jsonl * lint --------- Co-authored-by: Kenneth Enevoldsen --- docs/mmteb/points.md | 3 ++- docs/mmteb/points/936.jsonl | 2 ++ mteb/cli.py | 2 +- .../AmazonReviewsClassification.py | 18 +++++++-------- .../multilingual/MasakhaNEWSClassification.py | 18 +++++++-------- .../MassiveIntentClassification.py | 2 +- .../MassiveScenarioClassification.py | 2 +- .../Clustering/deu/TenKGnadClusteringP2P.py | 14 ++++++------ .../tasks/Reranking/eng/MindSmallReranking.py | 18 +++++++-------- mteb/tasks/Reranking/zho/CMTEBReranking.py | 22 +++++++++---------- mteb/tasks/Retrieval/eng/HagridRetrieval.py | 22 +++++++++---------- mteb/tasks/Retrieval/fra/AlloprofRetrieval.py | 22 +++++++++---------- mteb/tasks/Retrieval/fra/BSARDRetrieval.py | 22 +++++++++---------- mteb/tasks/Retrieval/fra/SyntecRetrieval.py | 16 +++++++------- 14 files changed, 93 insertions(+), 90 deletions(-) create mode 100644 docs/mmteb/points/936.jsonl diff --git a/docs/mmteb/points.md b/docs/mmteb/points.md index cd4b955b2b..6eec9b76c3 100644 --- a/docs/mmteb/points.md +++ b/docs/mmteb/points.md @@ -92,4 +92,5 @@ Please also add your first name and last name are as you want them to appear in | ManuelFay | Manuel | Faysse | manuel.faysse@centralesupelec.fr | ~Manuel_Faysse1 | CentraleSupélec & Illuin Technology | | hgissbkh | Hippolyte | Gisserot-Boukhlef | hippolyte.gisserot-boukhlef@centralesupelec.fr | ~Hippolyte_Gisserot-Boukhlef1 | CentraleSupélec & Artefact Research Center | | sted97 | Simone | Tedeschi | tedeschi@diag.uniroma1.it | ~Simone_Tedeschi1 | Sapienza University of Rome | -| gentaiscool | Genta Indra | Winata | genta.winata@capitalone.com | ~Genta_Indra_Winata1 | Capital One | \ No newline at end of file +| gentaiscool | Genta Indra | Winata | genta.winata@capitalone.com | ~Genta_Indra_Winata1 | Capital One | +| henilp105 | Henil | Panchal | henilp105@gmail.com | ~Henil_Shalin_Panchal1 | Nirma University | \ No newline at end of file diff --git a/docs/mmteb/points/936.jsonl b/docs/mmteb/points/936.jsonl new file mode 100644 index 0000000000..b0dd7bce57 --- /dev/null +++ b/docs/mmteb/points/936.jsonl @@ -0,0 +1,2 @@ +{"GitHub": "henilp105", "Dataset annotations": 9} +{"GitHub": "KennethEnevoldsen", "Review PR": 2} \ No newline at end of file diff --git a/mteb/cli.py b/mteb/cli.py index 9320149851..56efbe7176 100644 --- a/mteb/cli.py +++ b/mteb/cli.py @@ -271,7 +271,7 @@ def create_meta(args: argparse.Namespace) -> None: "dataset": { "type": task.metadata.dataset["path"], "name": f"MTEB {task.metadata.name} ({hf_subset_score['hf_subset']})", - "config": hf_subset_score["hf_subset"], + "config": hf_subset_score["hf_subset"], "split": split, "revision": task_result.dataset_revision, }, diff --git a/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py b/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py index f9e7807103..7500697a5f 100644 --- a/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py +++ b/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py @@ -26,15 +26,15 @@ class AmazonReviewsClassification(MultilingualTask, AbsTaskClassification): "zh": ["cmn-Hans"], }, main_score="accuracy", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - socioeconomic_status=None, - annotations_creators=None, - dialect=None, - text_creation=None, + date=("2015-11-01", "2019-11-01"), + form=["written"], + domains=["Reviews"], + task_subtypes=[], + license="https://docs.opendata.aws/amazon-reviews-ml/license.txt", + socioeconomic_status="medium", + annotations_creators="human-annotated", + dialect=[], + text_creation="found", bibtex_citation="""@misc{keung2020multilingual, title={The Multilingual Amazon Reviews Corpus}, author={Phillip Keung and Yichao Lu and György Szarvas and Noah A. Smith}, diff --git a/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py b/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py index 3684e1e4e1..d3f48c9aba 100644 --- a/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py +++ b/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py @@ -38,15 +38,15 @@ class MasakhaNEWSClassification(AbsTaskClassification, MultilingualTask): eval_splits=["test"], eval_langs=_LANGUAGES, main_score="accuracy", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - socioeconomic_status=None, - annotations_creators=None, - dialect=None, - text_creation=None, + date=("2023-01-01", "2023-04-19"), # rough estimate + form=["written"], + domains=["News"], + task_subtypes=[], + license="cc-by-nc-4.0", + socioeconomic_status="high", + annotations_creators="expert-annotated", + dialect=[], + text_creation="found", bibtex_citation="""@misc{adelani2023masakhanews, title={MasakhaNEWS: News Topic Classification for African languages}, author={David Ifeoluwa Adelani and Marek Masiak and Israel Abebe Azime and Jesujoba Alabi and Atnafu Lambebo Tonja and Christine Mwase and Odunayo Ogundepo and Bonaventure F. P. Dossou and Akintunde Oladipo and Doreen Nixdorf and Chris Chinenye Emezue and sana al-azzawi and Blessing Sibanda and Davis David and Lolwethu Ndolela and Jonathan Mukiibi and Tunde Ajayi and Tatiana Moteu and Brian Odhiambo and Abraham Owodunni and Nnaemeka Obiefuna and Muhidin Mohamed and Shamsuddeen Hassan Muhammad and Teshome Mulugeta Ababu and Saheed Abdullahi Salahudeen and Mesay Gemeda Yigezu and Tajuddeen Gwadabe and Idris Abdulmumin and Mahlet Taye and Oluwabusayo Awoyomi and Iyanuoluwa Shode and Tolulope Adelani and Habiba Abdulganiyu and Abdul-Hakeem Omotayo and Adetola Adeeko and Abeeb Afolabi and Anuoluwapo Aremu and Olanrewaju Samuel and Clemencia Siro and Wangari Kimotho and Onyekachi Ogbu and Chinedu Mbonu and Chiamaka Chukwuneke and Samuel Fanijo and Jessica Ojo and Oyinkansola Awosan and Tadesse Kebede and Toadoum Sari Sakayo and Pamela Nyatsine and Freedmore Sidume and Oreen Yousuf and Mardiyyah Oduwole and Tshinu Tshinu and Ussen Kimanuka and Thina Diko and Siyanda Nxakama and Sinodos Nigusse and Abdulmejid Johar and Shafie Mohamed and Fuad Mire Hassan and Moges Ahmed Mehamed and Evrard Ngabire and Jules Jules and Ivan Ssenkungu and Pontus Stenetorp}, diff --git a/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py b/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py index 30c3df480d..5a6242c6f9 100644 --- a/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py +++ b/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py @@ -68,7 +68,7 @@ class MassiveIntentClassification(MultilingualTask, AbsTaskClassification): "revision": "4672e20407010da34463acc759c162ca9734bca6", }, description="MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages", - reference="https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.", + reference="https://arxiv.org/abs/2204.08582", category="s2s", type="Classification", eval_splits=["validation", "test"], diff --git a/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py b/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py index cae0ebf2bf..d129cd7d75 100644 --- a/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py +++ b/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py @@ -68,7 +68,7 @@ class MassiveScenarioClassification(MultilingualTask, AbsTaskClassification): "revision": "fad2c6e8459f9e1c45d9315f4953d921437d70f8", }, description="MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages", - reference="https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.", + reference="https://arxiv.org/abs/2204.08582", category="s2s", type="Classification", eval_splits=["validation", "test"], diff --git a/mteb/tasks/Clustering/deu/TenKGnadClusteringP2P.py b/mteb/tasks/Clustering/deu/TenKGnadClusteringP2P.py index c63e72d3af..866dd67c60 100644 --- a/mteb/tasks/Clustering/deu/TenKGnadClusteringP2P.py +++ b/mteb/tasks/Clustering/deu/TenKGnadClusteringP2P.py @@ -22,14 +22,14 @@ class TenKGnadClusteringP2P(AbsTaskClustering): eval_langs=["deu-Latn"], main_score="v_measure", date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - socioeconomic_status=None, + form=["written"], + domains=["Web"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + socioeconomic_status="mixed", annotations_creators=None, - dialect=None, - text_creation=None, + dialect=[], + text_creation="found", bibtex_citation=None, n_samples={"test": 45914}, avg_character_length={"test": 2641.03}, diff --git a/mteb/tasks/Reranking/eng/MindSmallReranking.py b/mteb/tasks/Reranking/eng/MindSmallReranking.py index 1009a97f88..c3db5502a2 100644 --- a/mteb/tasks/Reranking/eng/MindSmallReranking.py +++ b/mteb/tasks/Reranking/eng/MindSmallReranking.py @@ -20,15 +20,15 @@ class MindSmallReranking(AbsTaskReranking): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="map", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - socioeconomic_status=None, - annotations_creators=None, - dialect=None, - text_creation=None, + date=("2019-10-12", "2019-11-22"), + form=["written"], + domains=["News"], + task_subtypes=[], + license="https://github.com/msnews/MIND/blob/master/MSR%20License_Data.pdf", + socioeconomic_status="mixed", + annotations_creators="expert-annotated", + dialect=[], + text_creation="found", bibtex_citation="""@inproceedings{wu-etal-2020-mind, title = "{MIND}: A Large-scale Dataset for News Recommendation", author = "Wu, Fangzhao and Qiao, Ying and Chen, Jiun-Hung and Wu, Chuhan and Qi, Tao and Lian, Jianxun and Liu, Danyang and Xie, Xing and Gao, Jianfeng and Wu, Winnie and Zhou, Ming", diff --git a/mteb/tasks/Reranking/zho/CMTEBReranking.py b/mteb/tasks/Reranking/zho/CMTEBReranking.py index d6f811e672..86b5328398 100644 --- a/mteb/tasks/Reranking/zho/CMTEBReranking.py +++ b/mteb/tasks/Reranking/zho/CMTEBReranking.py @@ -90,15 +90,15 @@ class CMedQAv1(AbsTaskReranking): eval_splits=["test"], eval_langs=["cmn-Hans"], main_score="map", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - socioeconomic_status=None, - annotations_creators=None, - dialect=None, - text_creation=None, + date=("2017-01-01", "2017-07-26"), + form=["written"], + domains=["Medical"], + task_subtypes=[], + license="not specified.", + socioeconomic_status="mixed", + annotations_creators="expert-annotated", + dialect=[], + text_creation="found", bibtex_citation="""@article{zhang2017chinese, title={Chinese Medical Question Answer Matching Using End-to-End Character-Level Multi-Scale CNNs}, author={Zhang, Sheng and Zhang, Xin and Wang, Hui and Cheng, Jiajun and Li, Pei and Ding, Zhaoyun}, @@ -109,8 +109,8 @@ class CMedQAv1(AbsTaskReranking): year={2017}, publisher={Multidisciplinary Digital Publishing Institute} }""", - n_samples=None, - avg_character_length=None, + n_samples={"test": 2000}, + avg_character_length={"test": 165}, ) diff --git a/mteb/tasks/Retrieval/eng/HagridRetrieval.py b/mteb/tasks/Retrieval/eng/HagridRetrieval.py index 59b96e29e2..119f286713 100644 --- a/mteb/tasks/Retrieval/eng/HagridRetrieval.py +++ b/mteb/tasks/Retrieval/eng/HagridRetrieval.py @@ -28,23 +28,23 @@ class HagridRetrieval(AbsTaskRetrieval): eval_splits=["dev"], eval_langs=["eng-Latn"], main_score="ndcg_at_10", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - socioeconomic_status=None, - annotations_creators=None, - dialect=None, - text_creation=None, + date=("2019-02-01", "2022-10-18"), + form=["written"], + domains=["Encyclopaedic"], + task_subtypes=[], + license="apache-2.0", + socioeconomic_status="mixed", + annotations_creators="expert-annotated", + dialect=[], + text_creation="found", bibtex_citation="""@article{hagrid, title={{HAGRID}: A Human-LLM Collaborative Dataset for Generative Information-Seeking with Attribution}, author={Ehsan Kamalloo and Aref Jafari and Xinyu Zhang and Nandan Thakur and Jimmy Lin}, year={2023}, journal={arXiv:2307.16883}, }""", - n_samples=None, - avg_character_length=None, + n_samples={"train": 1922}, + avg_character_length={"train": 14.53}, ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/fra/AlloprofRetrieval.py b/mteb/tasks/Retrieval/fra/AlloprofRetrieval.py index b3971465e0..6a1e3143d8 100644 --- a/mteb/tasks/Retrieval/fra/AlloprofRetrieval.py +++ b/mteb/tasks/Retrieval/fra/AlloprofRetrieval.py @@ -21,15 +21,15 @@ class AlloprofRetrieval(AbsTaskRetrieval): eval_splits=["test"], eval_langs=["fra-Latn"], main_score="ndcg_at_10", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - socioeconomic_status=None, - annotations_creators=None, - dialect=None, - text_creation=None, + date=None, # no date specified. + form=["written"], + domains=["Encyclopaedic"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + socioeconomic_status="mixed", + annotations_creators="human-annotated", + dialect=[], + text_creation="found", bibtex_citation="""@misc{lef23, doi = {10.48550/ARXIV.2302.07738}, url = {https://arxiv.org/abs/2302.07738}, @@ -40,8 +40,8 @@ class AlloprofRetrieval(AbsTaskRetrieval): year = {2023}, copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International} }""", - n_samples=None, - avg_character_length=None, + n_samples={"train": 2048}, + avg_character_length=None, # unable to extract due to invalid hf dataset ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/fra/BSARDRetrieval.py b/mteb/tasks/Retrieval/fra/BSARDRetrieval.py index 899370a499..1774d14954 100644 --- a/mteb/tasks/Retrieval/fra/BSARDRetrieval.py +++ b/mteb/tasks/Retrieval/fra/BSARDRetrieval.py @@ -21,15 +21,15 @@ class BSARDRetrieval(AbsTaskRetrieval): eval_splits=["test"], eval_langs=["fra-Latn"], main_score="recall_at_100", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - socioeconomic_status=None, - annotations_creators=None, - dialect=None, - text_creation=None, + date=("2021-05-01", "2021-08-26"), + form=["spoken"], + domains=["Legal"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + socioeconomic_status="medium", + annotations_creators="expert-annotated", + dialect=[], + text_creation="found", bibtex_citation="""@inproceedings{louis2022statutory, title = {A Statutory Article Retrieval Dataset in French}, author = {Louis, Antoine and Spanakis, Gerasimos}, @@ -42,8 +42,8 @@ class BSARDRetrieval(AbsTaskRetrieval): doi = {10.18653/v1/2022.acl-long.468}, pages = {6789–6803}, }""", - n_samples=None, - avg_character_length=None, + n_samples={"test": 222}, + avg_character_length={"test": 71.94}, ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/fra/SyntecRetrieval.py b/mteb/tasks/Retrieval/fra/SyntecRetrieval.py index 710d7a9268..c4382f4b7e 100644 --- a/mteb/tasks/Retrieval/fra/SyntecRetrieval.py +++ b/mteb/tasks/Retrieval/fra/SyntecRetrieval.py @@ -23,15 +23,15 @@ class SyntecRetrieval(AbsTaskRetrieval): eval_splits=_EVAL_SPLITS, eval_langs=["fra-Latn"], main_score="ndcg_at_10", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - socioeconomic_status=None, - annotations_creators=None, + date=None, # not specified + form=["written"], + domains=["Legal"], + task_subtypes=[], + license="not specified.", + socioeconomic_status="high", + annotations_creators="human-annotated", dialect=[], - text_creation=None, + text_creation="created", bibtex_citation="""@misc{ciancone2024extending, title={Extending the Massive Text Embedding Benchmark to French}, author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini},