
Commit d185a29

Author: zhaoyingjun
Commit message: ReleaseV1.0
1 parent 0202b9a commit d185a29

File tree: 4 files changed, +349 -0 lines changed


.idea/misc.xml: +3 lines (diff not rendered)
@@ -0,0 +1,73 @@
# coding=utf-8
import json
import os
import re
import jieba
from zhon.hanzi import punctuation
from config import getConfig
import io
import tensorflow as tf

# Load the hyper-parameter configuration file
gConfig = {}
gConfig = getConfig.get_config()
conv_path = gConfig['resource_data']
vocab_inp_path = gConfig['vocab_inp_path']
vocab_tar_path = gConfig['vocab_tar_path']
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
seq_train = gConfig['seq_data']

def predata_util():
    # Check whether the raw training corpus exists; if not, warn and exit
    if not os.path.exists(conv_path):
        print("找不到需要处理的文件,请确认在train_data文件中是否存在该文件")
        exit()
    # Create a new file to hold the processed conversation corpus
    seq_train = open(gConfig['seq_data'], 'w', encoding='utf-8')
    # Open the raw corpus and process it line by line
    with open(conv_path, encoding='utf-8') as f:
        one_conv = ""  # holds one complete conversation
        i = 0
        # Loop over the corpus
        for line in f:
            line = line.strip('\n')
            line = re.sub(r"[%s]+" % punctuation, "", line)  # strip punctuation
            if line == '':
                continue
            # If this line marks the start of a conversation, save the one just assembled
            if line[0] == gConfig['e']:
                if one_conv:
                    seq_train.write(one_conv[:-1] + '\n')
                    i = i + 1
                    if i % 1000 == 0:
                        print('处理进度:', i)
                one_conv = ""
            # If this line is an utterance, segment it with jieba and append it to the conversation
            elif line[0] == gConfig['m']:
                one_conv = one_conv + str(" ".join(jieba.cut(line.split(' ')[1]))) + '\t'  # store one question or answer
    # Processing finished, close the output file
    seq_train.close()

def create_vocab(lang, vocab_path, vocab_size):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token=3)
    tokenizer.fit_on_texts(lang)
    vocab = json.loads(tokenizer.to_json(ensure_ascii=False))
    vocab['index_word'] = tokenizer.index_word
    vocab['word_index'] = tokenizer.word_index
    vocab['document_count'] = tokenizer.document_count
    vocab = json.dumps(vocab, ensure_ascii=False)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write(vocab)
    print("字典保存在:{}".format(vocab_path))

def preprocess_sentence(w):
    w = 'start ' + w + ' end'
    return w

# Generate the processed corpus first, then read it back to build the two vocabularies
predata_util()
lines = io.open(seq_train, encoding='UTF-8').readlines()
word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
input_lang, target_lang = zip(*word_pairs)
create_vocab(input_lang, vocab_inp_path, vocab_inp_size)
create_vocab(target_lang, vocab_tar_path, vocab_tar_size)
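The two vocabulary files written by create_vocab() are read back later by the executor's tokenize() via tokenizer_from_json. A minimal standalone sketch of that round trip, assuming a toy corpus and an illustrative file name toy_vocab.json (neither is part of the commit):

import json
import tensorflow as tf

# Toy corpus with the same start/end framing used by preprocess_sentence()
toy_corpus = ["start 你好 end", "start 今天 天气 不错 end"]

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=100, oov_token=3)
tokenizer.fit_on_texts(toy_corpus)

# Save the tokenizer config as JSON, as create_vocab() does
with open("toy_vocab.json", "w", encoding="utf-8") as f:
    f.write(tokenizer.to_json(ensure_ascii=False))

# Rebuild it from the file, as the executor's tokenize() does
with open("toy_vocab.json", "r", encoding="utf-8") as f:
    restored = tf.keras.preprocessing.text.tokenizer_from_json(
        json.dumps(json.load(f), ensure_ascii=False))

# Both tokenizers should map the same words to the same ids
print(tokenizer.texts_to_sequences(toy_corpus))
print(restored.texts_to_sequences(toy_corpus))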
@@ -0,0 +1,153 @@
# coding=utf-8
# Import dependencies
import json
import os
import sys
import time
import tensorflow as tf
import horovod.tensorflow as hvd
import seq2seqModel
from config import getConfig
import io

hvd.init()
# Initialise the hyper-parameter dictionary and pull out the values we need
gConfig = {}
gConfig = getConfig.get_config()
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
embedding_dim = gConfig['embedding_dim']
units = gConfig['layer_size']
BATCH_SIZE = gConfig['batch_size']

max_length_inp = gConfig['max_length']
max_length_tar = gConfig['max_length']

log_dir = gConfig['log_dir']
writer = tf.summary.create_file_writer(log_dir)

# Pre-process an utterance by wrapping it with start/end markers
def preprocess_sentence(w):
    w = 'start ' + w + ' end'
    return w

# Read the training corpus, convert words to ids with the pre-built vocabularies,
# and pad every sequence to the configured maximum length
def read_data(path):
    path = os.getcwd() + '/' + path
    if not os.path.exists(path):
        path = os.path.dirname(os.getcwd()) + '/' + path
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
    input_lang, target_lang = zip(*word_pairs)
    input_tokenizer = tokenize(gConfig['vocab_inp_path'])
    target_tokenizer = tokenize(gConfig['vocab_tar_path'])
    input_tensor = input_tokenizer.texts_to_sequences(input_lang)
    target_tensor = target_tokenizer.texts_to_sequences(target_lang)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen=max_length_inp,
                                                                 padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_tar,
                                                                  padding='post')
    return input_tensor, input_tokenizer, target_tensor, target_tokenizer

# Rebuild a Keras tokenizer from the vocabulary JSON generated during preprocessing;
# it is used for the word2number conversion and padding
def tokenize(vocab_file):
    # Read the tokenizer config that was saved in the vocabulary file
    with open(vocab_file, 'r', encoding='utf-8') as f:
        tokenize_config = json.dumps(json.load(f), ensure_ascii=False)
        lang_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenize_config)
    return lang_tokenizer

input_tensor, input_token, target_tensor, target_token = read_data(gConfig['seq_data'])
steps_per_epoch = len(input_tensor) // (gConfig['batch_size'] * hvd.size())
BUFFER_SIZE = len(input_tensor)
dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
enc_hidden = seq2seqModel.encoder.initialize_hidden_state()
dataset = dataset.shard(hvd.size(), hvd.rank())

# Training entry point
def train():
    # Read the training data and convert it with the pre-built vocabularies
    print("Preparing data in %s" % gConfig['train_data'])
    print('每个epoch的训练步数: {}'.format(steps_per_epoch))
    # If a pre-trained model already exists, restore it and continue training
    checkpoint_dir = gConfig['model_data']
    ckpt = tf.io.gfile.listdir(checkpoint_dir)
    if ckpt:
        print("reload pretrained model")
        seq2seqModel.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    start_time = time.time()
    current_steps = 0
    epoch = 0
    train_epoch = gConfig['train_epoch']
    # Train for the configured number of epochs
    while epoch < train_epoch:
        start_time_epoch = time.time()
        total_loss = 0
        # Run one epoch of steps_per_epoch training steps
        for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = seq2seqModel.training_step(inp, targ, target_token, enc_hidden, batch == 0)
            total_loss += batch_loss
            print('epoch:{} batch:{} batch_loss: {}'.format(epoch, batch, batch_loss))
        # After each epoch, compute the average time per step and the average loss
        step_time_epoch = (time.time() - start_time_epoch) / steps_per_epoch
        step_loss = total_loss / steps_per_epoch
        current_steps += steps_per_epoch
        epoch_time_total = (time.time() - start_time)
        print('训练总步数: {} 总耗时: {} epoch平均每步耗时: {} 平均每步loss {:.4f}'
              .format(current_steps, epoch_time_total, step_time_epoch, step_loss))
        # Save the checkpoint for this epoch, updating the model files
        seq2seqModel.checkpoint.save(file_prefix=checkpoint_prefix)
        sys.stdout.flush()
        epoch = epoch + 1
        with writer.as_default():
            tf.summary.scalar('loss', step_loss, step=epoch)

# Prediction: generate the reply for a given input sentence
def predict(sentence):
    # Rebuild the tokenizers from the saved vocabulary files
    input_tokenizer = tokenize(gConfig['vocab_inp_path'])
    target_tokenizer = tokenize(gConfig['vocab_tar_path'])
    # Restore the trained model
    checkpoint_dir = gConfig['model_data']
    seq2seqModel.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    # Wrap the input sentence with start/end markers
    sentence = preprocess_sentence(sentence)
    # Convert words to ids
    inputs = input_tokenizer.texts_to_sequences([sentence])
    # Pad to the maximum input length
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    # Initialise the encoder hidden state
    hidden = [tf.zeros((1, units))]
    # Encode the input sentence
    enc_out, enc_hidden = seq2seqModel.encoder(inputs, hidden)
    dec_hidden = enc_hidden
    # The decoder starts from the id of the 'start' token
    dec_input = tf.expand_dims([target_tokenizer.word_index['start']], 0)
    # Decode step by step, up to the maximum target length
    for t in range(max_length_tar):
        # Decode one step and take the id with the highest probability
        predictions, dec_hidden, attention_weights = seq2seqModel.decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        # Stop when the 'end' token is produced, otherwise append the predicted word
        if target_tokenizer.index_word[predicted_id] == 'end':
            break
        result += str(target_tokenizer.index_word[predicted_id]) + ' '
        # Feed the predicted id back in as the next decoder input
        dec_input = tf.expand_dims([predicted_id], 0)
    return result

# Entry point: choose the working mode from the configuration
if __name__ == '__main__':
    # If a config file is passed on the command line, read the hyper-parameters from it,
    # otherwise fall back to the default config file
    if len(sys.argv) > 1:
        gConfig = getConfig.get_config(sys.argv[1])
    else:
        gConfig = getConfig.get_config()
    print('\n>> 执行器模式 : %s\n' % (gConfig['mode']))
    if gConfig['mode'] == 'train':
        print('现在进行模型的训练')
        train()
    elif gConfig['mode'] == 'serve':
        print('当前为服务模式,请运行web程序,进行人机交互')
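Because the dataset is batched before it is sharded, each Horovod rank receives every hvd.size()-th batch rather than a contiguous slice of the corpus. A small standalone sketch of that behaviour, assuming 4 toy samples and 2 simulated workers standing in for hvd.size() and hvd.rank() (none of this is part of the commit):

import tensorflow as tf

# Four toy samples, batched one per batch so the split is easy to see
samples = tf.constant([[1, 2], [3, 4], [5, 6], [7, 8]])
dataset = tf.data.Dataset.from_tensor_slices(samples).batch(1, drop_remainder=True)

num_workers = 2  # stands in for hvd.size()
for rank in range(num_workers):  # stands in for hvd.rank()
    shard = dataset.shard(num_workers, rank)
    # rank 0 sees batches 0 and 2, rank 1 sees batches 1 and 3
    print('rank', rank, [b.numpy().tolist() for b in shard])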
@@ -0,0 +1,120 @@
# Import dependencies
import tensorflow as tf
from config import getConfig
import horovod.tensorflow as hvd

tf.config.experimental_run_functions_eagerly(True)
hvd.init()
# Initialise the hyper-parameter dictionary
gConfig = {}
gConfig = getConfig.get_config()
# Read vocab_inp_size, vocab_tar_size, embedding_dim, units and the batch size from the config
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
embedding_dim = gConfig['embedding_dim']
units = gConfig['layer_size']
BATCH_SIZE = gConfig['batch_size'] * hvd.size()

# Encoder
class Encoder(tf.keras.Model):
    # Initialise the layers with the given hyper-parameters
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.batch_size = batch_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True,
                                       recurrent_initializer='glorot_uniform')

    # Forward pass: embed the input and run it through the GRU
    def call(self, x, hidden):
        x_emb = self.embedding(x)
        output, state = self.gru(x_emb, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))

# Bahdanau (additive) attention, one of the most common attention mechanisms
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        # Initialise the attention layers
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # Add a time axis to the query so it can be added to the values
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # Add the projected values and query, apply tanh, and project down to a scalar score
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
        # Turn the scores into attention weights with softmax over the time axis
        attention_weights = tf.nn.softmax(score, axis=1)
        # Weight the values (the encoder output) by the attention weights
        context_vector = attention_weights * values
        # Sum over the time axis to get the final context vector
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        # Initialise batch_sz, dec_units, embedding, gru, fc and attention
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, y, hidden, enc_output):
        # First compute attention over the encoder output and the decoder hidden state
        context_vector, attention_weights = self.attention(hidden, enc_output)
        # Embed the decoder input
        y = self.embedding(y)
        # Concatenate the context vector with the embedded input and feed it to the GRU
        y = tf.concat([tf.expand_dims(context_vector, 1), y], axis=-1)
        output, state = self.gru(y)
        # Reshape the GRU output and project it to the vocabulary with the dense layer
        output = tf.reshape(output, (-1, output.shape[2]))
        y = self.fc(output)
        return y, state, attention_weights

# Loss function
def loss_function(real, pred):
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    # Mask out padding (id 0) so padded positions do not contribute to the loss
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)  # cast the boolean mask to the loss dtype
    loss_ *= mask
    return tf.reduce_mean(loss_)

# Instantiate the encoder, decoder, optimizer and checkpoint
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
optimizer = tf.keras.optimizers.Adam()
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

@tf.function
def training_step(inp, targ, targ_lang, enc_hidden, first_batch, allreduce=True):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang.word_index['start']] * BATCH_SIZE, 1)
        # Teacher forcing: feed the ground-truth token of step t-1 as the decoder input at step t
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
    step_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    if allreduce:
        # Average the gradients across Horovod workers
        tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    if first_batch:
        # Broadcast the initial variables from rank 0 so all workers start in sync
        hvd.broadcast_variables(variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)
    return step_loss
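The padding mask in loss_function() is what keeps padded time steps from affecting the cross-entropy. A minimal standalone check with toy tensors (the shapes and the 10-word vocabulary are illustrative, not from the commit):

import tensorflow as tf

real = tf.constant([2, 5, 0, 0])      # the last two positions are padding
pred = tf.random.uniform((4, 10))     # logits over a toy 10-word vocabulary

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
per_position = loss_object(real, pred)                                   # shape (4,)
mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), per_position.dtype)
print(per_position.numpy())           # four loss values, one per position
print((per_position * mask).numpy())  # the padded positions are now exactly 0.0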
