Skip to content

Commit 765c606

Browse files
neuMARCO (#181)
* neumarco - wip * added documentation
1 parent c348cd4 commit 765c606

File tree

6 files changed

+290
-0
lines changed

6 files changed

+290
-0
lines changed

ir_datasets/datasets/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from . import msmarco_passage
3030
from . import msmarco_passage_v2
3131
from . import msmarco_qna
32+
from . import neumarco
3233
from . import nfcorpus
3334
from . import natural_questions
3435
from . import nyt

ir_datasets/datasets/neumarco.py

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import io
2+
import codecs
3+
import re
4+
import ir_datasets
5+
from ir_datasets.util import DownloadConfig, TarExtract, Cache
6+
from ir_datasets.datasets.base import Dataset, YamlDocumentation
7+
from ir_datasets.datasets import msmarco_passage
8+
from ir_datasets.formats import TsvDocs
9+
10+
# Top-level dataset identifier: used for the documentation file name, the
# local storage directory, the download-config context and the registry prefix.
NAME = 'neumarco'
11+
12+
13+
def _init():
    """Build and register the neuMARCO collections and their subsets.

    For every target language, a translated passage collection is extracted
    from the ``main`` download archive and exposed as ``neumarco/<lang>``.
    Each collection is also paired with the queries/qrels (and, for the
    training splits, docpairs) handlers taken from the matching
    ``msmarco-passage`` subsets in the registry.

    Returns:
        A ``(collection, subsets)`` tuple: the collection built for the last
        language processed, and a dict mapping subset names (relative to
        ``neumarco/``) to their ``Dataset`` objects.
    """
    docs_yaml = YamlDocumentation(f'docs/{NAME}.yaml')
    root_dir = ir_datasets.util.home_path()/NAME
    downloads = DownloadConfig.context(NAME, root_dir)

    # Which handler kinds are borrowed from each msmarco-passage subset.
    # Only the training splits carry docpairs.
    borrowed_kinds = {
        'train': ('queries', 'qrels', 'docpairs'),
        'train/judged': ('queries', 'qrels', 'docpairs'),
        'dev': ('queries', 'qrels'),
        'dev/small': ('queries', 'qrels'),
        'dev/judged': ('queries', 'qrels'),
    }
    borrowed_handlers = {
        split: [
            getattr(ir_datasets.registry[f'msmarco-passage/{split}'], f'{kind}_handler')()
            for kind in kinds
        ]
        for split, kinds in borrowed_kinds.items()
    }

    archive = downloads['main']

    registered = {}
    # (three-letter code used inside the archive, two-letter code used in dataset IDs)
    for iso3, iso2 in (('fas', 'fa'), ('zho', 'zh'), ('rus', 'ru')):
        member = f'eng-{iso3}/msmarco.collection.20210731-scale21-sockeye2-tm1.tsv'
        extracted = Cache(TarExtract(archive, member), root_dir/f'{iso2}.tsv')
        corpus = TsvDocs(
            extracted,
            namespace=f'{NAME}/{iso2}',
            lang=iso2,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{iso2}'),
        )
        # Docs-only dataset for the language itself ...
        registered[f'{iso2}'] = Dataset(corpus, docs_yaml(f'{iso2}'))
        # ... followed by one dataset per borrowed msmarco-passage subset.
        for split, handlers in borrowed_handlers.items():
            registered[f'{iso2}/{split}'] = Dataset(
                corpus,
                *handlers,
                docs_yaml(f'{iso2}/{split}'),
            )

    ir_datasets.registry.register(NAME, Dataset(docs_yaml('_')))
    for key in sorted(registered):
        ir_datasets.registry.register(f'{NAME}/{key}', registered[key])

    return corpus, registered
62+
63+
64+
# Runs at import time: registers the datasets and keeps the returned
# collection and subset mapping as module-level attributes.
collection, subsets = _init()

ir_datasets/docs/neumarco.yaml

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
_:
2+
pretty_name: "neuMARCO"
3+
desc: '
4+
<p>
5+
A version of <a class="ds-ref">msmarco-passage</a> for cross-language
6+
information retrieval, provided by <a href="https://hltcoe.jhu.edu/">JHU HLTCOE</a> with documents
7+
translated to other languages using a <a href="https://www.amazon.science/publications/sockeye-2-a-toolkit-for-neural-machine-translation">
8+
Sockeye 2</a> translation model.
9+
</p>
10+
<ul>
11+
<li>Documents: Web passages machine-translated from English</li>
12+
<li>Queries: Natural-language web queries in English</li>
13+
</ul>
14+
'
15+
16+
fa:
17+
desc: '
18+
<p>The <a class="ds-ref">msmarco-passage</a> corpus, translated to Persian (Farsi).</p>
19+
'
20+
21+
fa/dev:
22+
desc: '
23+
<p>A version of <a class="ds-ref">msmarco-passage/dev</a>, with the corpus translated to Persian (Farsi).</p>
24+
'
25+
26+
fa/dev/judged:
27+
desc: '
28+
<p>A version of <a class="ds-ref">msmarco-passage/dev/judged</a>, with the corpus translated to Persian (Farsi).</p>
29+
'
30+
31+
fa/dev/small:
32+
desc: '
33+
<p>A version of <a class="ds-ref">msmarco-passage/dev/small</a>, with the corpus translated to Persian (Farsi).</p>
34+
'
35+
36+
fa/train:
37+
desc: '
38+
<p>A version of <a class="ds-ref">msmarco-passage/train</a>, with the corpus translated to Persian (Farsi).</p>
39+
'
40+
41+
fa/train/judged:
42+
desc: '
43+
<p>A version of <a class="ds-ref">msmarco-passage/train/judged</a>, with the corpus translated to Persian (Farsi).</p>
44+
'
45+
46+
ru:
47+
desc: '
48+
<p>The <a class="ds-ref">msmarco-passage</a> corpus, translated to Russian.</p>
49+
'
50+
51+
ru/dev:
52+
desc: '
53+
<p>A version of <a class="ds-ref">msmarco-passage/dev</a>, with the corpus translated to Russian.</p>
54+
'
55+
56+
ru/dev/judged:
57+
desc: '
58+
<p>A version of <a class="ds-ref">msmarco-passage/dev/judged</a>, with the corpus translated to Russian.</p>
59+
'
60+
61+
ru/dev/small:
62+
desc: '
63+
<p>A version of <a class="ds-ref">msmarco-passage/dev/small</a>, with the corpus translated to Russian.</p>
64+
'
65+
66+
ru/train:
67+
desc: '
68+
<p>A version of <a class="ds-ref">msmarco-passage/train</a>, with the corpus translated to Russian.</p>
69+
'
70+
71+
ru/train/judged:
72+
desc: '
73+
<p>A version of <a class="ds-ref">msmarco-passage/train/judged</a>, with the corpus translated to Russian.</p>
74+
'
75+
76+
zh:
77+
desc: '
78+
<p>The <a class="ds-ref">msmarco-passage</a> corpus, translated to Chinese.</p>
79+
'
80+
81+
zh/dev:
82+
desc: '
83+
<p>A version of <a class="ds-ref">msmarco-passage/dev</a>, with the corpus translated to Chinese.</p>
84+
'
85+
86+
zh/dev/judged:
87+
desc: '
88+
<p>A version of <a class="ds-ref">msmarco-passage/dev/judged</a>, with the corpus translated to Chinese.</p>
89+
'
90+
91+
zh/dev/small:
92+
desc: '
93+
<p>A version of <a class="ds-ref">msmarco-passage/dev/small</a>, with the corpus translated to Chinese.</p>
94+
'
95+
96+
zh/train:
97+
desc: '
98+
<p>A version of <a class="ds-ref">msmarco-passage/train</a>, with the corpus translated to Chinese.</p>
99+
'
100+
101+
zh/train/judged:
102+
desc: '
103+
<p>A version of <a class="ds-ref">msmarco-passage/train/judged</a>, with the corpus translated to Chinese.</p>
104+
'
105+

ir_datasets/etc/downloads.json

+9
Original file line numberDiff line numberDiff line change
@@ -2861,6 +2861,15 @@
28612861
}
28622862
},
28632863

2864+
"neumarco": {
2865+
"main": {
2866+
"url": "https://livejohnshopkins-my.sharepoint.com/:u:/g/personal/dlawrie1_jh_edu/EQcICtPaSqFNoCZHtoeZszoB7FC362BvaPvieUSk2j30tA?download=1",
2867+
"size_hint": 3723728998,
2868+
"expected_md5": "733181c211959a7c09c695bfcddaea54",
2869+
"cache_path": "neuMSMARCO.tar.gz"
2870+
}
2871+
},
2872+
28642873
"nfcorpus": {
28652874
"main": {
28662875
"url": "https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz",

ir_datasets/etc/metadata.json

+19
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,25 @@
458458
"neuclir/1/ru/hc4-filtered": {"docs": {"count": 964719, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}, "queries": {"count": 54}, "qrels": {"count": 3235, "fields": {"relevance": {"counts_by_value": {"0": 2483, "1": 478, "3": 274}}}}},
459459
"neuclir/1/zh": {"docs": {"count": 3179209, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}},
460460
"neuclir/1/zh/hc4-filtered": {"docs": {"count": 519945, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}, "queries": {"count": 60}, "qrels": {"count": 3217, "fields": {"relevance": {"counts_by_value": {"0": 2651, "3": 344, "1": 222}}}}},
461+
"neumarco": {},
462+
"neumarco/fa": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}},
463+
"neumarco/fa/dev": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}},
464+
"neumarco/fa/dev/judged": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 55578}, "qrels": {"_ref": "neumarco/fa/dev"}},
465+
"neumarco/fa/dev/small": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}},
466+
"neumarco/fa/train": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 269919004}},
467+
"neumarco/fa/train/judged": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 502939}, "qrels": {"_ref": "neumarco/fa/train"}, "docpairs": {"_ref": "neumarco/fa/train"}},
468+
"neumarco/ru": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}},
469+
"neumarco/ru/dev": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}},
470+
"neumarco/ru/dev/judged": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 55578}, "qrels": {"_ref": "neumarco/ru/dev"}},
471+
"neumarco/ru/dev/small": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}},
472+
"neumarco/ru/train": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 269919004}},
473+
"neumarco/ru/train/judged": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 502939}, "qrels": {"_ref": "neumarco/ru/train"}, "docpairs": {"_ref": "neumarco/ru/train"}},
474+
"neumarco/zh": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}},
475+
"neumarco/zh/dev": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}},
476+
"neumarco/zh/dev/judged": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 55578}, "qrels": {"_ref": "neumarco/zh/dev"}},
477+
"neumarco/zh/dev/small": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}},
478+
"neumarco/zh/train": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 269919004}},
479+
"neumarco/zh/train/judged": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 502939}, "qrels": {"_ref": "neumarco/zh/train"}, "docpairs": {"_ref": "neumarco/zh/train"}},
461480
"nfcorpus": {"docs": {"count": 5371, "fields": {"doc_id": {"max_len": 8, "common_prefix": "MED-"}}}},
462481
"nfcorpus/dev": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 325}, "qrels": {"count": 14589, "fields": {"relevance": {"counts_by_value": {"3": 521, "2": 10864, "1": 3204}}}}},
463482
"nfcorpus/dev/nontopic": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 144}, "qrels": {"count": 4353, "fields": {"relevance": {"counts_by_value": {"3": 521, "2": 3133, "1": 699}}}}},

test/integration/neumarco.py

+92
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import re
2+
import unittest
3+
from ir_datasets.formats import TrecQrel, GenericQuery, GenericDoc, GenericDocPair
4+
from .base import DatasetIntegrationTest
5+
6+
7+
class TestNeuMarco(DatasetIntegrationTest):
    """Integration checks for the ``neumarco`` datasets.

    Every test compares the record count and three sampled records (index 0,
    index 9 and the final index) against fixed expectations for the ``fa``,
    ``ru`` and ``zh`` variants.
    """

    def test_docs(self):
        """Check the document count and sampled passages of each translated corpus."""
        # re.DOTALL | re.UNICODE is the integer 48.
        pattern_flags = re.DOTALL | re.UNICODE
        expectations = [
            ('neumarco/fa', {
                0: GenericDoc('0', re.compile('^ حضور ارتباطات در میان ذهن \u200c های علمی برای موفقیت پروژه منهتن به همان اندازه مهم بود که هوش علمی بود.{57}می معلق است ، چیزی است که موفقیت آن \u200c ها به معنای واقعی آن است ؛ صدها هزار زندگی بی \u200c گناه نابود شد\\.$', flags=pattern_flags)),
                9: GenericDoc('9', ' یکی از دلایل اصلی انتخاب هنفورد به عنوان یک مکان برای پروژه منهتن رآکتور B نزدیکی به رودخانه کلمبیا بود ، بزرگترین رودخانه \u200c ای که از ساحل آمریکای شمالی به اقیانوس آرام می \u200c ریزد.'),
                8841822: GenericDoc('8841822', re.compile('^ تصویر کامل اندازه را مشاهده کنید\\. پشت صحنه نور خیره کننده نشان می دهد که تماشاگران اووه و آه در چها.{272}ده نمک \u200c های فلزی و اکسید فلزی وجود دارد که برای تولید مجموعه \u200c ای از رنگ \u200c ها واکنش نشان می \u200c دهند\\.$', flags=pattern_flags)),
            }),
            ('neumarco/ru', {
                0: GenericDoc('0', re.compile('^ Присутствие общения среди научных умов было не менее важным для успеха Манхэттенского проекта, как .{98}лей и инженеров, это то, что их успех действительно означал; сотни тысяч невинных жизней уничтожены\\.$', flags=pattern_flags)),
                9: GenericDoc('9', re.compile('^ Одной из главных причин, по которой Хэнфорд был выбран в качестве объекта для « B Reactor » Манхэтт.{31}сть к реке Колумбия, самой большой реке, протекающей в Тихий океан с северо американского побережья\\.$', flags=pattern_flags)),
                8841822: GenericDoc('8841822', re.compile('^ Просмотр полноразмерного изображения\\. За кулисами ослепительного света видно, что зрители ooh и ahh.{313}ми, в основном солями металлов и оксидами металлов, которые реагируют на получение множества цветов\\.$', flags=pattern_flags)),
            }),
            ('neumarco/zh', {
                0: GenericDoc('0', ' 在科学头脑中的交流对曼哈顿项目的成功同样重要,因为科学智慧是科学智慧。 原子研究人员和工程师令人印象深刻的成就中唯一的云就是他们的成功真正意味着什么;数十万无辜的生命被消灭了。'),
                9: GenericDoc('9', ' 汉福德被选定为曼哈顿项目B反应堆的一个主要原因是它靠近哥伦比亚河,这是北美海岸流入太平洋的最大河流。'),
                8841822: GenericDoc('8841822', ' 查看全尺寸图像。 在耀眼的灯光的背后,7月4日的观众们都是精心制作的烟花。 不管是红色、白色和蓝色的喷泉还是紫色的火花,每个烟花都充满了正确的化学物质组合,以创造这些五颜六色的灯光。 在每一个手工烟花中,都有少量的特殊化学物质,主要是金属盐和金属氧化物,它们会反应产生一系列的颜色。'),
            }),
        ]
        for dataset_id, samples in expectations:
            self._test_docs(dataset_id, count=8841823, items=samples)

    def test_queries(self):
        """Check query counts and sampled queries; expectations are the same for every language."""
        cases = [
            ('train', 808731, {
                0: GenericQuery('121352', 'define extreme'),
                9: GenericQuery('492875', 'sanitizer temperature'),
                808730: GenericQuery('50393', 'benefits of boiling lemons and drinking juice.'),
            }),
            ('train/judged', 502939, {
                0: GenericQuery('121352', 'define extreme'),
                9: GenericQuery('54528', 'blood clots in urine after menopause'),
                502938: GenericQuery('50393', 'benefits of boiling lemons and drinking juice.'),
            }),
            ('dev', 101093, {
                0: GenericQuery('1048578', 'cost of endless pools/swim spa'),
                9: GenericQuery('1048587', 'what is patron'),
                101092: GenericQuery('524285', 'treadmill incline meaning'),
            }),
            ('dev/small', 6980, {
                0: GenericQuery('1048585', "what is paula deen's brother"),
                9: GenericQuery('524699', 'tricare service number'),
                6979: GenericQuery('1048565', 'who plays sebastian michaelis'),
            }),
            ('dev/judged', 55578, {
                0: GenericQuery('1048578', 'cost of endless pools/swim spa'),
                9: GenericQuery('1048601', 'what is pastoral medicine'),
                55577: GenericQuery('1048570', 'what is pearls before swine?'),
            }),
        ]
        for lang in ('fa', 'ru', 'zh'):
            for subset, total, samples in cases:
                # Hand each call its own dict, as separate literals would.
                self._test_queries(f'neumarco/{lang}/{subset}', count=total, items=dict(samples))

    def test_qrels(self):
        """Check qrels counts and sampled judgments; expectations are the same for every language."""
        # The judged variants are checked against the same qrels as their parent split.
        train_samples = {
            0: TrecQrel('1185869', '0', 1, '0'),
            9: TrecQrel('186154', '1160', 1, '0'),
            532760: TrecQrel('405466', '8841735', 1, '0'),
        }
        dev_samples = {
            0: TrecQrel('1102432', '2026790', 1, '0'),
            9: TrecQrel('300674', '7067032', 1, '0'),
            59272: TrecQrel('371455', '8009476', 1, '0'),
        }
        dev_small_samples = {
            0: TrecQrel('300674', '7067032', 1, '0'),
            9: TrecQrel('54544', '7068203', 1, '0'),
            7436: TrecQrel('195199', '8009377', 1, '0'),
        }
        cases = [
            ('train', 532761, train_samples),
            ('train/judged', 532761, train_samples),
            ('dev', 59273, dev_samples),
            ('dev/small', 7437, dev_small_samples),
            ('dev/judged', 59273, dev_samples),
        ]
        for lang in ('fa', 'ru', 'zh'):
            for subset, total, samples in cases:
                # Hand each call its own dict, as separate literals would.
                self._test_qrels(f'neumarco/{lang}/{subset}', count=total, items=dict(samples))

    def test_docpairs(self):
        """Check the training docpairs count and sampled pairs for every language."""
        for lang in ('fa', 'ru', 'zh'):
            samples = {
                0: GenericDocPair('662731', '193249', '2975302'),
                9: GenericDocPair('411362', '31018', '4238671'),
                269919003: GenericDocPair('88228', '5117891', '7075853'),
            }
            self._test_docpairs(f'neumarco/{lang}/train', count=269919004, items=samples)
88+
89+
90+
91+
# Run the integration tests when this module is executed directly.
if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)