
Commit 05cb8fc

Author: zhaoyingjun (committed)
Refresh the README and update the roadmap and the open-source WeChat discussion group, to make communication easier.
1 parent ccb5a77 commit 05cb8fc

29 files changed: +22954 −0 lines changed
@@ -0,0 +1,15 @@
# coding=utf-8
import os
from configparser import ConfigParser

config_file = os.getcwd() + '/config/seq2seq.ini'
if not os.path.exists(config_file):
    config_file = os.path.dirname(os.getcwd()) + '/config/seq2seq.ini'
#print(config_file)

# accept an optional path so a different ini file can be passed in (e.g. from the command line)
def get_config(config_file=config_file):
    # SafeConfigParser is deprecated; ConfigParser provides the same interface here
    parser = ConfigParser()
    parser.read(config_file, encoding='utf-8')
    # get the ints, floats and strings and merge them into one flat dict
    _conf_ints = [(key, int(value)) for key, value in parser.items('ints')]
    _conf_floats = [(key, float(value)) for key, value in parser.items('floats')]
    _conf_strings = [(key, str(value)) for key, value in parser.items('strings')]
    return dict(_conf_ints + _conf_floats + _conf_strings)
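
As a quick illustration of the loader above, a minimal usage sketch (assuming the seq2seq.ini shown next is in place under config/): the three sections are flattened into a single dictionary with values already cast to their declared types.

from config import getConfig

gConfig = getConfig.get_config()
print(gConfig['mode'])        # 'train'  (from [strings])
print(gConfig['batch_size'])  # 64       (from [ints], already an int)
print(gConfig['min_loss'])    # 0.2      (from [floats], already a float)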
@@ -0,0 +1,33 @@
[strings]
# mode: train, test, serve
mode = train
train_data = train_data
seq_data = train_data/seq.data
vocab_inp_path = train_data/inp.vocab
vocab_tar_path = train_data/tar.vocab
# raw training corpus file
resource_data = train_data/xiaohuangji50w.conv
# prefix for the split training sample files
split_train_data = train_data/seq_data_
# markers that identify conversation starts (E) and utterance lines (M) in the raw corpus
e = E
m = M
model_data = model_data
log_dir = log_dir

[ints]
# vocabulary size
# 20,000 is a reasonable size
vocab_inp_size = 20000
vocab_tar_size = 20000
embedding_dim = 128
train_epoch = 10
# typical options: 128, 256, 512, 1024
layer_size = 512
batch_size = 64
# maximum sentence length
max_length = 20
number_work = 2

[floats]
# minimum loss; training is meant to stop once the model loss reaches this level
min_loss = 0.2
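
For orientation, the raw corpus that resource_data points to is expected to use the E/M markers declared above. The sketch below is illustrative only (the sentences are made up, not taken from xiaohuangji50w.conv) and is inferred from how the preprocessing script reads line[0] and line.split(' ')[1]: an E line opens a conversation and each following M line carries one utterance.

E
M 今天天气怎么样
M 天气很好
E
M 在吗
M 在的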
@@ -0,0 +1,73 @@
# coding=utf-8
import json
import os
import re
import jieba
from zhon.hanzi import punctuation
from config import getConfig
import io
import tensorflow as tf

# load the configuration
gConfig = {}
gConfig = getConfig.get_config()
conv_path = gConfig['resource_data']
vocab_inp_path = gConfig['vocab_inp_path']
vocab_tar_path = gConfig['vocab_tar_path']
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
seq_train = gConfig['seq_data']

def predata_util():
    # check that the raw training corpus exists; warn and exit if it does not
    if not os.path.exists(conv_path):
        print("Cannot find the corpus file to process; please confirm it exists under train_data")
        exit()
    # create a new file to hold the processed dialogue corpus
    seq_train = open(gConfig['seq_data'], 'w', encoding='utf-8')
    # open the raw corpus and process it line by line
    with open(conv_path, encoding='utf-8') as f:
        one_conv = ""  # holds one complete conversation
        i = 0
        # loop over the corpus
        for line in f:
            line = line.strip('\n')
            line = re.sub(r"[%s]+" % punctuation, "", line)  # strip punctuation
            if line == '':
                continue
            # an E line starts a new conversation, so save the one just assembled
            if line[0] == gConfig['e']:
                if one_conv:
                    seq_train.write(one_conv[:-1] + '\n')
                    i = i + 1
                    if i % 1000 == 0:
                        print('progress:', i)
                one_conv = ""
            # an M line is an utterance: segment it with jieba and append it to the conversation
            elif line[0] == gConfig['m']:
                one_conv = one_conv + str(" ".join(jieba.cut(line.split(' ')[1]))) + '\t'  # store one question or answer
    # processing finished, close the output file
    seq_train.close()

def create_vocab(lang, vocab_path, vocab_size):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token=3)
    tokenizer.fit_on_texts(lang)
    vocab = json.loads(tokenizer.to_json(ensure_ascii=False))
    vocab['index_word'] = tokenizer.index_word
    vocab['word_index'] = tokenizer.word_index
    vocab['document_count'] = tokenizer.document_count
    vocab = json.dumps(vocab, ensure_ascii=False)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write(vocab)
    print("vocabulary saved to: {}".format(vocab_path))

def preprocess_sentence(w):
    w = 'start ' + w + ' end'
    return w

# build seq.data first, then read it back to fit the vocabularies
predata_util()
lines = io.open(seq_train, encoding='UTF-8').readlines()
word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
input_lang, target_lang = zip(*word_pairs)
create_vocab(input_lang, vocab_inp_path, vocab_inp_size)
create_vocab(target_lang, vocab_tar_path, vocab_tar_size)
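
To make the intermediate format concrete, here is a minimal sketch (the question/answer pair is hypothetical) of how one line of seq.data is built by the segmentation step above: the jieba-segmented question and answer joined by a tab, which the training script later splits on '\t' and wraps with 'start'/'end'.

import jieba

question, answer = "今天天气怎么样", "天气很好"
line = " ".join(jieba.cut(question)) + "\t" + " ".join(jieba.cut(answer))
print(line)  # roughly: 今天 天气 怎么样	天气 很 好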
@@ -0,0 +1,148 @@
# coding=utf-8
# import dependencies
import json
import os
import sys
import time
import tensorflow as tf
import seq2seqModel
from config import getConfig
import io

# initialize the hyper-parameter dictionary and read the values it provides
gConfig = {}
gConfig = getConfig.get_config()
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
embedding_dim = gConfig['embedding_dim']
units = gConfig['layer_size']
BATCH_SIZE = gConfig['batch_size']

max_length_inp = gConfig['max_length']
max_length_tar = gConfig['max_length']
log_dir = gConfig['log_dir']
writer = tf.summary.create_file_writer(log_dir)

# wrap each training sentence with start/end markers
def preprocess_sentence(w):
    w = 'start ' + w + ' end'
    return w

# read the training corpus, convert words to ids with the pre-built vocabularies, and pad
def read_data(path):
    path = os.getcwd() + '/' + path
    if not os.path.exists(path):
        path = os.path.dirname(os.getcwd()) + '/' + path
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
    input_lang, target_lang = zip(*word_pairs)
    input_tokenizer = tokenize(gConfig['vocab_inp_path'])
    target_tokenizer = tokenize(gConfig['vocab_tar_path'])
    input_tensor = input_tokenizer.texts_to_sequences(input_lang)
    target_tensor = target_tokenizer.texts_to_sequences(target_lang)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen=max_length_inp,
                                                                 padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_tar,
                                                                  padding='post')
    return input_tensor, input_tokenizer, target_tensor, target_tokenizer

# rebuild a Keras tokenizer from the vocabulary json generated during preprocessing
def tokenize(vocab_file):
    # read the pre-generated tokenizer config from the vocabulary file
    with open(vocab_file, 'r', encoding='utf-8') as f:
        tokenize_config = json.dumps(json.load(f), ensure_ascii=False)
        lang_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenize_config)
    # the returned tokenizer handles the word-to-id conversion
    return lang_tokenizer

input_tensor, input_token, target_tensor, target_token = read_data(gConfig['seq_data'])
steps_per_epoch = len(input_tensor) // gConfig['batch_size']
BUFFER_SIZE = len(input_tensor)
# use a Dataset so the training data can be read concurrently and batched efficiently
dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
enc_hidden = seq2seqModel.encoder.initialize_hidden_state()

# training entry point
def train():
    # the corpus has already been read and converted with the pre-built vocabularies above
    print("Preparing data in %s" % gConfig['train_data'])
    print('steps per epoch: {}'.format(steps_per_epoch))
    # if a pretrained checkpoint already exists, restore it and continue training
    checkpoint_dir = gConfig['model_data']
    ckpt = tf.io.gfile.listdir(checkpoint_dir)
    if ckpt:
        print("reload pretrained model")
        seq2seqModel.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    start_time = time.time()
    # current_loss = 2
    # min_loss = gConfig['min_loss']
    epoch = 0
    current_steps = 0
    train_epoch = gConfig['train_epoch']
    # run the training loop for train_epoch epochs (the min_loss early stop from the config is not applied here)
    while epoch < train_epoch:
        start_time_epoch = time.time()
        total_loss = 0
        # one epoch is steps_per_epoch batches
        for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = seq2seqModel.training_step(inp, targ, target_token, enc_hidden)
            total_loss += batch_loss
            print('epoch: {} batch: {} batch_loss: {}'.format(epoch, batch, batch_loss))
        # at the end of the epoch, compute the average step time and loss for this epoch
        step_time_epoch = (time.time() - start_time_epoch) / steps_per_epoch
        step_loss = total_loss / steps_per_epoch
        current_steps += steps_per_epoch
        epoch_time_total = (time.time() - start_time)
        print('total steps: {} total time: {} average step time this epoch: {} average step loss {:.4f}'
              .format(current_steps, epoch_time_total, step_time_epoch, step_loss))
        # save the checkpoint for this epoch, updating the model files
        seq2seqModel.checkpoint.save(file_prefix=checkpoint_prefix)
        sys.stdout.flush()
        epoch = epoch + 1
        with writer.as_default():
            tf.summary.scalar('loss', step_loss, step=epoch)

# prediction: generate the reply for a given input sentence
def predict(sentence):
    # rebuild the tokenizers from the saved vocabulary files
    input_tokenizer = tokenize(gConfig['vocab_inp_path'])
    target_tokenizer = tokenize(gConfig['vocab_tar_path'])
    # restore the trained model
    checkpoint_dir = gConfig['model_data']
    seq2seqModel.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    # wrap the input sentence with start/end markers
    sentence = preprocess_sentence(sentence)
    # convert the words to ids (pass the sentence as a one-element list so it is tokenized as a whole)
    inputs = input_tokenizer.texts_to_sequences([sentence])
    # pad to the maximum input length
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    # initialize the encoder hidden state
    hidden = [tf.zeros((1, units))]
    # encode the input sentence to extract its features
    enc_out, enc_hidden = seq2seqModel.encoder(inputs, hidden)
    dec_hidden = enc_hidden
    # the decoder starts from the id of the 'start' token
    dec_input = tf.expand_dims([target_tokenizer.word_index['start']], 0)
    # decode step by step, up to the maximum target length
    for t in range(max_length_tar):
        # take the id with the highest probability at this step
        predictions, dec_hidden, attention_weights = seq2seqModel.decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        # stop when the 'end' token is produced; otherwise map the id back to a word and append it
        if target_tokenizer.index_word[predicted_id] == 'end':
            break
        result += str(target_tokenizer.index_word[predicted_id]) + ' '
        # feed the predicted id back as the next decoder input
        dec_input = tf.expand_dims([predicted_id], 0)
    return result

# entry point: start the requested working mode according to the configuration
if __name__ == '__main__':
    # if a configuration file is given on the command line, read it; otherwise use the default one
    if len(sys.argv) > 1:
        gConfig = getConfig.get_config(sys.argv[1])
    else:
        gConfig = getConfig.get_config()
    print('\n>> executor mode : %s\n' % (gConfig['mode']))
    if gConfig['mode'] == 'train':
        print('starting model training')
        train()
    elif gConfig['mode'] == 'serve':
        print('serve mode: please run the web application for interactive chat')
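
A minimal usage sketch for the two modes (assuming preprocessing has already produced seq.data and the vocabulary files, and that this script is importable as a module named execute, which is a name not shown in this diff): with mode = train in seq2seq.ini the script is simply run to start training; in serve mode the web layer is expected to call predict() with a jieba-segmented sentence.

# hypothetical interactive use after training; the module name 'execute' is an assumption
import jieba
import execute

sentence = " ".join(jieba.cut("今天天气怎么样"))  # segment the input the same way the corpus was segmented
print(execute.predict(sentence))                  # prints the decoded reply word by word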
@@ -0,0 +1,117 @@
# import dependencies
import tensorflow as tf
from config import getConfig

tf.config.experimental_run_functions_eagerly(True)
# initialize the hyper-parameter dictionary
gConfig = {}
gConfig = getConfig.get_config()
# read vocab_inp_size, vocab_tar_size, embedding_dim, units, etc. from the configuration
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
embedding_dim = gConfig['embedding_dim']
units = gConfig['layer_size']
BATCH_SIZE = gConfig['batch_size']

# Encoder: an embedding layer followed by a single GRU layer
class Encoder(tf.keras.Model):
    # initialize the default parameters
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.batch_size = batch_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True,
                                       recurrent_initializer='glorot_uniform')

    # forward pass
    def call(self, x, hidden):
        x_emb = self.embedding(x)
        output, state = self.gru(x_emb, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))

# BahdanauAttention, one of the standard attention mechanisms
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        # attention sub-networks
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # add a time axis to the query so it can be added to values
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # combine values with the expanded query, apply tanh, and project to a one-dimensional score
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
        # softmax turns the scores into attention probabilities over the source positions
        attention_weights = tf.nn.softmax(score, axis=1)
        # weight the encoder outputs (values) by the attention probabilities
        context_vector = attention_weights * values
        # sum over the source positions to get the final context vector
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights


class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        # initialize batch_sz, dec_units, embedding, gru, fc and attention
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, y, hidden, enc_output):
        # attend over enc_output with the decoder hidden state to get the context vector
        context_vector, attention_weights = self.attention(hidden, enc_output)
        # embed the decoder input
        y = self.embedding(y)
        # concatenate the context vector with the embedded input and feed it into the GRU
        y = tf.concat([tf.expand_dims(context_vector, 1), y], axis=-1)
        output, state = self.gru(y)
        # reshape the GRU output and project it to the vocabulary with the dense layer
        output = tf.reshape(output, (-1, output.shape[2]))
        y = self.fc(output)
        return y, state, attention_weights

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

# loss function
def loss_function(real, pred):
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # mask out the padding positions (id 0) so they do not contribute to the loss
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)  # convert the boolean mask to numbers
    loss_ *= mask
    return tf.reduce_mean(loss_)

# instantiate the encoder, decoder, optimizer and checkpoint
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
optimizer = tf.keras.optimizers.Adam()
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

@tf.function
def training_step(inp, targ, targ_lang, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        # teacher forcing: every sequence in the batch starts from the 'start' token
        dec_input = tf.expand_dims([targ_lang.word_index['start']] * BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            # feed the ground-truth token of this step as the next decoder input
            dec_input = tf.expand_dims(targ[:, t], 1)
    step_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return step_loss
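
To sanity-check the tensor shapes in the attention mechanism above, a small standalone sketch (the batch size, source length and unit count are illustrative, not taken from the config) that mirrors the BahdanauAttention computation step by step:

import tensorflow as tf

batch, src_len, units = 4, 20, 512
values = tf.random.normal((batch, src_len, units))   # encoder outputs
query = tf.random.normal((batch, units))             # decoder hidden state

W1 = tf.keras.layers.Dense(units)
W2 = tf.keras.layers.Dense(units)
V = tf.keras.layers.Dense(1)

score = V(tf.nn.tanh(W1(values) + W2(tf.expand_dims(query, 1))))    # (batch, src_len, 1)
attention_weights = tf.nn.softmax(score, axis=1)                    # probabilities over src_len
context_vector = tf.reduce_sum(attention_weights * values, axis=1)  # (batch, units)

print(score.shape, attention_weights.shape, context_vector.shape)
# expected: (4, 20, 1) (4, 20, 1) (4, 512)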

Chatbot-tensowflow2.0/Seq2seqchatbot/static/css/normalize.css: +1 line
Some generated files are not rendered by default.
