
陈旭亮/bert_encoder

forked from Pasca/bert_encoder 
This repository has not declared an open-source license file (LICENSE); before using it, check the project description and the upstream dependencies of its code.
bert_encoder.py 2.91 KB
lzphahaha committed on 2020-03-11 10:52 . fix
# -*- coding:utf-8 -*-
import os
from bert import modeling
import tensorflow as tf
from bert import tokenization
flags = tf.flags
FLAGS = flags.FLAGS
bert_path = r'chinese_L-12_H-768_A-12'
root_path = os.getcwd()
flags.DEFINE_string(
    "bert_config_file", os.path.join(bert_path, 'bert_config.json'),
    "The config json file corresponding to the pre-trained BERT model.")
flags.DEFINE_string(
    "vocab_file", os.path.join(bert_path, 'vocab.txt'),
    "The vocabulary file that the BERT model was trained on.")
# The code below references FLAGS.init_checkpoint but the original file never
# defines the flag; define it here, assuming the standard checkpoint prefix
# shipped with the chinese_L-12_H-768_A-12 release.
flags.DEFINE_string(
    "init_checkpoint", os.path.join(bert_path, 'bert_model.ckpt'),
    "Initial checkpoint from the pre-trained BERT model.")
flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text.")
flags.DEFINE_integer(
    "max_seq_length", 128,
    "The maximum total input sequence length after WordPiece tokenization.")
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
tokenizer = tokenization.FullTokenizer(
    vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
def data_preprocess(sentence):
    tokens = []
    for word in sentence:
        # WordPiece tokenization; for Chinese this splits into single characters.
        tokens.extend(tokenizer.tokenize(word))
    # Truncate the sequence: reserve two positions for the [CLS] and [SEP] markers.
    if len(tokens) >= FLAGS.max_seq_length - 1:
        tokens = tokens[0:(FLAGS.max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    ntokens.append("[CLS]")  # [CLS] marks the start of the sequence.
    segment_ids.append(0)
    for token in tokens:
        ntokens.append(token)
        segment_ids.append(0)
    ntokens.append("[SEP]")  # [SEP] marks the end of the sequence.
    segment_ids.append(0)
    # Convert the tokens (ntokens) to vocabulary ids.
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the maximum sequence length.
    while len(input_ids) < FLAGS.max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
    # Add a batch dimension (batch size 1) to both outputs.
    return [input_ids], [input_mask]
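
# Rough illustration of the assumed behaviour (example sentence, not from the
# original repo): for "新年快乐" with max_seq_length=128 the tokens become
# ["[CLS]", "新", "年", "快", "乐", "[SEP]"], so data_preprocess returns
# input_ids of shape [1, 128] (6 real ids followed by 122 zero-pads) and
# input_mask of shape [1, 128] ([1]*6 + [0]*122).
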
class BertEncoder(object):
    def __init__(self):
        # The upstream modeling.BertModel takes the input tensors directly (it has
        # no max_seq_length argument and no input_ids attribute), so feed placeholders.
        self.input_ids = tf.placeholder(tf.int32, [None, FLAGS.max_seq_length])
        self.input_mask = tf.placeholder(tf.int32, [None, FLAGS.max_seq_length])
        self.bert_model = modeling.BertModel(
            config=bert_config, is_training=False,
            input_ids=self.input_ids, input_mask=self.input_mask)
        tvars = tf.trainable_variables()
        (assignment_map, _) = modeling.get_assignment_map_from_checkpoint(
            tvars, FLAGS.init_checkpoint)
        tf.train.init_from_checkpoint(FLAGS.init_checkpoint, assignment_map)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def encode(self, sentence):
        input_ids, input_mask = data_preprocess(sentence)
        # embedding_output is the embedding-layer output, [batch, seq_len, hidden].
        return self.sess.run(
            self.bert_model.embedding_output,
            feed_dict={self.input_ids: input_ids, self.input_mask: input_mask})
if __name__ == "__main__":
    be = BertEncoder()
    embedding = be.encode("新年快乐,恭喜发财,万事如意!")
    print(embedding)
    print(embedding.shape)
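
Note that embedding_output is the output of BERT's embedding lookup (token, segment, and position embeddings) before any Transformer layer, so the returned vectors are not contextualised. A minimal sketch of fetching the contextual representation instead, using the get_sequence_output() accessor of the upstream modeling.BertModel (placeholder names follow the class above):

    # Inside BertEncoder.encode, fetch the final Transformer layer instead,
    # shape [batch_size, max_seq_length, hidden_size].
    sequence_output = self.bert_model.get_sequence_output()
    return self.sess.run(
        sequence_output,
        feed_dict={self.input_ids: input_ids, self.input_mask: input_mask})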