-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataset_download.py
55 lines (44 loc) · 2.02 KB
/
dataset_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import json
from tqdm import tqdm
# 下载预训练数据集
os.system("modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir your_local_dir")
# 解压预训练数据集
os.system("tar -xvf your_local_dir/mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2")
# 下载SFT数据集
os.system(f'huggingface-cli download --repo-type dataset --resume-download BelleGroup/train_3.5M_CN --local-dir BelleGroup')
# 1 处理预训练数据
def split_text(text, chunk_size=512):
"""将文本按指定长度切分成块"""
return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
input_file = 'mobvoi_seq_monkey_general_open_corpus.jsonl'
with open('seq_monkey_datawhale.jsonl', 'a', encoding='utf-8') as pretrain:
with open(input_file, 'r', encoding='utf-8') as f:
data = f.readlines()
for line in tqdm(data, desc=f"Processing lines in {input_file}", leave=False): # 添加行级别的进度条
line = json.loads(line)
text = line['text']
chunks = split_text(text)
for chunk in chunks:
pretrain.write(json.dumps({'text': chunk}, ensure_ascii=False) + '\n')
# 2 处理SFT数据
def convert_message(data):
"""
将原始数据转换为标准格式
"""
message = [
{"role": "system", "content": "你是一个AI助手"},
]
for item in data:
if item['from'] == 'human':
message.append({'role': 'user', 'content': item['value']})
elif item['from'] == 'assistant':
message.append({'role': 'assistant', 'content': item['value']})
return message
with open('BelleGroup_sft.jsonl', 'a', encoding='utf-8') as sft:
with open('BelleGroup/train_3.5M_CN.json', 'r') as f:
data = f.readlines()
for item in tqdm(data, desc="Processing", unit="lines"):
item = json.loads(item)
message = convert_message(item['conversations'])
sft.write(json.dumps(message, ensure_ascii=False) + '\n')