1 Star 0 Fork 0

xqy2006/llama2.c-zh

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
tokenizer.py 2.63 KB
一键复制 编辑 原始数据 按行查看 历史
chenyang 提交于 2023-08-14 14:01 . [v0.0.1]: init
# Taken from llama code and lightly modified
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
import os
import struct
from logging import getLogger
from typing import List
from sentencepiece import SentencePieceProcessor
from config import TOKENIZER_BIN, TOKENIZER_MODEL
class Tokenizer:
    """Wrapper around a SentencePiece tokenizer model.

    Loads the model from TOKENIZER_MODEL, exposes encode/decode, and can
    export the vocabulary to TOKENIZER_BIN in the llama2.c binary format.
    """

    def __init__(self):
        """Load the SentencePiece model and cache vocab size and special-token IDs.

        Raises:
            AssertionError: if TOKENIZER_MODEL does not exist on disk, or if
                the model's vocab size disagrees with its piece count.
        """
        model_path = TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        print(f"Loaded SentencePiece model from {model_path}")
        # BOS / EOS / PAD token IDs, cached for encode() and export()
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        """Encode string `s` into token IDs.

        Args:
            s: text to tokenize.
            bos: prepend the BOS token ID when True.
            eos: append the EOS token ID when True.

        Returns:
            List of token IDs.
        """
        # isinstance (rather than `type(s) is str`) also accepts str subclasses
        assert isinstance(s, str)
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        """Decode a list of token IDs back into text."""
        return self.sp_model.decode(t)

    def export(self):
        """Write the vocabulary to TOKENIZER_BIN in llama2.c's binary format.

        Layout: one uint32 (max token byte length), then for each token ID in
        order: float32 score, uint32 byte length, raw UTF-8 bytes.
        """
        # get all the tokens (postprocessed) and their scores as floats
        tokens, scores = [], []
        for i in range(self.n_words):
            # decode the token and light postprocessing
            t = self.sp_model.id_to_piece(i)
            s = self.sp_model.get_score(i)
            if i == self.bos_id:
                t = '\n<s>\n'
            elif i == self.eos_id:
                t = '\n</s>\n'
            elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
                t = chr(int(t[3:5], 16))  # e.g. make '<0x01>' into '\x01'
            t = t.replace('▁', ' ')  # sentencepiece uses this character as whitespace
            b = t.encode('utf-8')  # bytes of this token, utf-8 encoded
            tokens.append(b)
            scores.append(s)
        # record the max token length; default=0 keeps an empty vocab from crashing max()
        max_token_length = max((len(t) for t in tokens), default=0)
        # write to a binary file
        with open(TOKENIZER_BIN, 'wb') as f:
            f.write(struct.pack("I", max_token_length))
            # renamed loop variable: `bytes` shadowed the builtin type
            for token_bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(token_bytes)))
                f.write(token_bytes)
        print(f"TOKENIZER_BIN is saved = {TOKENIZER_BIN}")
if __name__ == "__main__":
    # Build the tokenizer and immediately export it to the binary format.
    Tokenizer().export()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/kilszfdjs/llama2.c-zh.git
git@gitee.com:kilszfdjs/llama2.c-zh.git
kilszfdjs
llama2.c-zh
llama2.c-zh
main

搜索帮助