Skip to content

Commit

Permalink
fix: Add DBpedia dataset (#501)
Browse files Browse the repository at this point in the history
* add dbpedia dataset

* add points

* PR comments

* calc results again

* fill all metadata

* fix test

* Update docs/mmteb/points/501.jsonl

---------

Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
  • Loading branch information
dipam7 and isaac-chung authored Apr 26, 2024
1 parent 3348e6e commit c49de83
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 0 deletions.
3 changes: 3 additions & 0 deletions docs/mmteb/points/501.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"GitHub": "dipam7", "New dataset": 2}
{"GitHub": "isaac-chung", "Review PR": 2}
{"GitHub": "imenelydiaker", "Review PR": 2}
1 change: 1 addition & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from .eng.ContractNLIExplicitIdentificationLegalBenchClassification import *
from .eng.ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification import *
from .eng.ContractNLILimitedUseLegalBenchClassification import *
from .eng.DBpediaClassification import *
from .eng.EmotionClassification import *
from .eng.FinancialPhrasebankClassification import *
from .eng.ImdbClassification import *
Expand Down
52 changes: 52 additions & 0 deletions mteb/tasks/Classification/eng/DBpediaClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks import AbsTaskClassification


class DBpediaClassification(AbsTaskClassification):
    """Classification task over the DBpedia14 dataset (English, 14 classes).

    The task is described declaratively by ``metadata``; the only custom
    logic is ``dataset_transform``, which renames the raw ``content`` column
    to ``text`` and then subsamples the ``train`` and ``test`` splits.
    """

    metadata = TaskMetadata(
        name="DBpediaClassification",
        description="DBpedia14 is a dataset of English texts from Wikipedia articles, categorized into 14 non-overlapping classes based on their DBpedia ontology.",
        reference="https://arxiv.org/abs/1509.01626",
        # Dataset location plus a pinned revision hash.
        dataset={
            "path": "fancyzhx/dbpedia_14",
            "revision": "9abd46cf7fc8b4c64290f26993c540b92aa145ac",
        },
        type="Classification",
        category="s2s",
        # Only the "test" split is evaluated; "accuracy" is the headline score.
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2022-01-25", "2022-01-25"),
        form=["written"],
        domains=["Encyclopaedic"],
        task_subtypes=["Topic classification"],
        license="cc-by-sa-3.0",
        socioeconomic_status="low",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        bibtex_citation="""
@inproceedings{NIPS2015_250cf8b5,
author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
booktitle = {Advances in Neural Information Processing Systems},
editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
pages = {},
publisher = {Curran Associates, Inc.},
title = {Character-level Convolutional Networks for Text Classification},
url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf},
volume = {28},
year = {2015}
}
""",
        # NOTE(review): dataset_transform subsamples the "test" split, so these
        # figures presumably describe the split before subsampling -- confirm.
        n_samples={"test": 70000},
        avg_character_length={"test": 281.40},
    )

    def dataset_transform(self):
        """Rename the text column and subsample the train/test splits.

        Replaces ``self.dataset`` twice: first with a copy whose ``content``
        column is renamed to ``text``, then with the result of
        ``self.stratified_subsampling`` applied to the ``train`` and ``test``
        splits, seeded with ``self.seed``.
        """
        # The raw dataset exposes its text under "content"; expose it as "text".
        renamed = self.dataset.rename_column("content", "text")
        self.dataset = renamed

        # Subsampling is delegated to a helper that is not defined in this
        # class (presumably inherited from AbsTaskClassification).
        subsample_options = {"seed": self.seed, "splits": ["train", "test"]}
        self.dataset = self.stratified_subsampling(renamed, **subsample_options)
13 changes: 13 additions & 0 deletions results/intfloat__multilingual-e5-small/DBpediaClassification.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"dataset_revision": "9abd46cf7fc8b4c64290f26993c540b92aa145ac",
"mteb_dataset_name": "DBpediaClassification",
"mteb_version": "1.6.36",
"test": {
"accuracy": 0.871435546875,
"accuracy_stderr": 0.007089500734212557,
"evaluation_time": 38.69,
"f1": 0.863423609172077,
"f1_stderr": 0.008270870625325584,
"main_score": 0.871435546875
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"dataset_revision": "9abd46cf7fc8b4c64290f26993c540b92aa145ac",
"mteb_dataset_name": "DBpediaClassification",
"mteb_version": "1.6.36",
"test": {
"accuracy": 0.850830078125,
"accuracy_stderr": 0.013677541375994482,
"evaluation_time": 34.75,
"f1": 0.8482267879321842,
"f1_stderr": 0.01348333759139074,
"main_score": 0.850830078125
}
}

0 comments on commit c49de83

Please sign in to comment.