# -*- coding:utf-8 -*-
import os

import tensorflow as tf

from bert import modeling
from bert import tokenization
flags = tf.flags
FLAGS = flags.FLAGS
bert_path = r'chinese_L-12_H-768_A-12'
root_path = os.getcwd()
flags.DEFINE_string(
    "bert_config_file", os.path.join(bert_path, 'bert_config.json'),
    "The config json file corresponding to the pre-trained BERT model.")
flags.DEFINE_string(
    "vocab_file", os.path.join(bert_path, 'vocab.txt'),
    "The vocabulary file that the BERT model was trained on.")
flags.DEFINE_string(
    "init_checkpoint", os.path.join(bert_path, 'bert_model.ckpt'),
    "Initial checkpoint (usually from a pre-trained BERT model).")
flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text.")
flags.DEFINE_integer(
    "max_seq_length", 128,
    "The maximum total input sequence length after WordPiece tokenization.")
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
tokenizer = tokenization.FullTokenizer(
    vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
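# Note: FullTokenizer's BasicTokenizer splits CJK text into single characters
# before WordPiece runs, so for Chinese input the tokens are individual
# characters, e.g. tokenizer.tokenize("新年") -> ['新', '年'].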
def data_preprocess(sentence):
    tokens = []
    for word in sentence:
        # WordPiece tokenization; for Chinese this yields single characters.
        token = tokenizer.tokenize(word)
        tokens.extend(token)
    # Truncate the sequence; -2 reserves room for the [CLS] and [SEP] markers.
    if len(tokens) >= FLAGS.max_seq_length - 1:
        tokens = tokens[0:(FLAGS.max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    ntokens.append("[CLS]")  # sentence-start marker
    segment_ids.append(0)
    for token in tokens:
        ntokens.append(token)
        segment_ids.append(0)
    ntokens.append("[SEP]")  # sentence-end marker
    segment_ids.append(0)
    # Convert the tokens (ntokens) to their vocabulary IDs.
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    # Zero-pad both lists up to the maximum sequence length.
    while len(input_ids) < FLAGS.max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
    # Wrap each in a batch dimension of size 1.
    return [input_ids], [input_mask]
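# A minimal sanity check for data_preprocess (a sketch; it assumes the vocab
# file loaded above so the module-level tokenizer works):
#
#   ids, mask = data_preprocess("你好")
#   assert len(ids[0]) == FLAGS.max_seq_length   # zero-padded to full length
#   assert sum(mask[0]) == 4                     # [CLS] + 2 characters + [SEP]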
class BertEncoder(object):
    def __init__(self):
        # BertModel takes the input tensors directly, so feed via placeholders.
        self.input_ids = tf.placeholder(
            tf.int32, shape=[None, FLAGS.max_seq_length], name="input_ids")
        self.input_mask = tf.placeholder(
            tf.int32, shape=[None, FLAGS.max_seq_length], name="input_mask")
        self.bert_model = modeling.BertModel(
            config=bert_config, is_training=False,
            input_ids=self.input_ids, input_mask=self.input_mask)
        tvars = tf.trainable_variables()
        # Map the pre-trained checkpoint weights onto the graph variables.
        (assignment_map, _) = modeling.get_assignment_map_from_checkpoint(
            tvars, FLAGS.init_checkpoint)
        tf.train.init_from_checkpoint(FLAGS.init_checkpoint, assignment_map)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def encode(self, sentence):
        input_ids, input_mask = data_preprocess(sentence)
        return self.sess.run(self.bert_model.embedding_output,
                             feed_dict={self.input_ids: input_ids,
                                        self.input_mask: input_mask})
if __name__ == "__main__":
    be = BertEncoder()
    # "Happy New Year, wishing you prosperity, may everything go well!"
    embedding = be.encode("新年快乐,恭喜发财,万事如意!")
    print(embedding)
    print(embedding.shape)  # (1, max_seq_length, hidden_size)
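    # Note: embedding_output is the sum of the token, segment and position
    # embeddings, i.e. the *input* to the Transformer stack. A sketch of
    # fetching contextual vectors from the last encoder layer instead, using
    # get_sequence_output() from the same bert.modeling API:
    ids, mask = data_preprocess("新年快乐")
    contextual = be.sess.run(
        be.bert_model.get_sequence_output(),
        feed_dict={be.input_ids: ids, be.input_mask: mask})
    print(contextual.shape)  # also (1, max_seq_length, hidden_size)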