# glm-ms/test.py

# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import argparse
import re
import mindspore as ms
from mindspore import ops, Tensor, nn
from mindspore.ops import operations as P
from mindspore.nn.transformer.transformer import AttentionMask
from tokenization import get_tokenizer

# Command-line interface: a single option naming the tokenizer to load.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--tokenizer_type",
    default="icetk-glm-130B",
)

# Parsed at import time, so the options are available as a module global.
args_ = parser.parse_args()
def isEnglish(s):
    """Return True when ``s`` contains only ASCII characters, else False.

    Despite the name this is a pure ASCII check: the string is encoded as
    UTF-8 and the bytes are then decoded as ASCII, which fails as soon as any
    non-ASCII character is present.
    """
    utf8_bytes = s.encode(encoding="utf-8")
    try:
        utf8_bytes.decode("ascii")
    except UnicodeDecodeError:
        return False
    return True


def get_masks_and_position_ids(seq, mask_position, max_gen_length, gmask=False):
    """Pad ``seq`` for generation and derive its attention mask and position ids.

    Args:
        seq: 2-D token-id tensor (the padding spec below has two axes); the
            size of axis 1 is taken as the context length.
        mask_position: Value written into every position id from index
            ``context_length - 1`` onwards when ``gmask`` is false.
        max_gen_length: Number of slots appended on the right of axis 1.
        gmask: When true, the position ids are left as a plain 0..N-1 range.

    Returns:
        A ``(tokens, attention_mask, position_ids)`` tuple: the padded
        tokens, a boolean mask with an extra axis inserted at index 1, and
        the position ids with an extra leading axis.
    """
    context_length = seq.shape[1]

    # Shift the ids up by one, pad, then shift back down: every slot that the
    # pad fills with 0 ends up as -1 while the original ids are unchanged.
    # (Assumes nn.Pad zero-fills in its default mode -- see MindSpore docs.)
    shift_up = Tensor(1, ms.int32)
    shift_down = Tensor(-1, ms.int32)
    plus = ops.Add()
    right_pad = nn.Pad(paddings=((0, 0), (0, max_gen_length)))
    tokens = plus(right_pad(plus(shift_up, seq)), shift_down)
    total_length = tokens.shape[-1]

    # NOTE(review): AttentionMask is expected to yield a lower-triangular
    # mask for an all-ones input -- confirm against the MindSpore docs.
    build_mask = AttentionMask(seq_length=total_length)
    attention_mask = build_mask(ops.Ones()((1, total_length), ms.float32))
    # Open up the first ``context_length - 1`` columns for every row.
    attention_mask[..., : context_length - 1] = 1
    # Entries below 0.5 become True in the returned boolean mask.
    attention_mask = ops.Less()(ops.expand_dims(attention_mask, axis=1), 0.5)

    position_ids = ms.numpy.arange(total_length, dtype=ms.int32)
    if not gmask:
        # From the last context slot onwards every position is pinned to the
        # mask position.
        position_ids[context_length - 1:] = mask_position

    return tokens, attention_mask, ops.expand_dims(position_ids, axis=0)


def fill_blanks(raw_text, tokenizer, max_seq_length=100, max_gen_length=20):
    """Encode ``raw_text`` around its mask markers and build the model inputs.

    The text is split on ``[MASK]`` / ``[sMASK]`` / ``[gMASK]`` markers; each
    text piece is tokenized and each marker is replaced by the command token
    the tokenizer returns for it.  When the text contains no marker a
    ``[gMASK]`` is appended.  The padded tokens, attention mask and position
    ids are then computed for the first occurrence of the generation mask.

    Args:
        raw_text: Prompt text, optionally containing mask markers.
        tokenizer: Object providing ``tokenize(text)`` (returning a list) and
            ``get_command(name)``.
        max_seq_length: Largest number of tokens allowed in the encoded prompt.
        max_gen_length: Number of padding slots reserved after the prompt;
            forwarded to ``get_masks_and_position_ids``.

    Returns:
        The ``(tokens, attention_mask, position_ids)`` tuple produced by
        ``get_masks_and_position_ids``, or ``None`` when the generation mask
        token is not present in the encoded sequence.

    Raises:
        ValueError: If the encoded prompt has more than ``max_seq_length``
            tokens.
    """
    # Pick the mask flavour that drives generation: [MASK] wins over [sMASK],
    # and [gMASK] is the fallback when neither is present.
    if "[MASK]" in raw_text:
        generation_mask = "[MASK]"
    elif "[sMASK]" in raw_text:
        generation_mask = "[sMASK]"
    else:
        generation_mask = "[gMASK]"
    use_gmask = generation_mask == "[gMASK]"

    mask_pattern = r"\[[sg]?MASK\]"
    text_pieces = re.split(mask_pattern, raw_text)
    markers = re.findall(mask_pattern, raw_text)

    # The pattern has no capture groups, so re.split yields exactly one more
    # piece than there are markers; zip pairs each marker with the text that
    # precedes it and the trailing piece is handled afterwards.
    seq = []
    for piece, marker in zip(text_pieces, markers):
        seq.extend(tokenizer.tokenize(piece))
        seq.append(tokenizer.get_command(marker))
    seq.extend(tokenizer.tokenize(text_pieces[-1]))

    if "MASK]" not in raw_text:
        seq.append(tokenizer.get_command(generation_mask))
        raw_text += " " + generation_mask
    if not raw_text.endswith("MASK]"):
        seq.append(tokenizer.get_command("eos"))

    print("\nInput: {}\n".format(raw_text))
    if len(seq) > max_seq_length:
        raise ValueError("text too long.")

    # Only the first mask position is processed.  Nothing in this function
    # fills the mask in, so looping "until no mask is left" cannot make
    # progress (the earlier loop emptied its work list and then indexed it,
    # raising IndexError on the second pass).
    mask_token = tokenizer.get_command(generation_mask)
    if mask_token not in seq:
        return None
    mask_position = seq.index(mask_token)

    input_seq = Tensor([seq + [tokenizer.get_command("sop")]])
    input_seq = P.Cast()(input_seq, ms.int64)
    return get_masks_and_position_ids(
        input_seq, mask_position, max_gen_length, use_gmask
    )

# Smoke run: build the tokenizer selected on the command line, show how a
# plain sentence is tokenized, then exercise fill_blanks on a masked prompt.
tokenizer = get_tokenizer(args_)
print(tokenizer.tokenize("I love Beijing."))
fill_blanks(raw_text="I love [MASK] Beijing.", tokenizer=tokenizer)