1 Star 1 Fork 0

庵中十三居士/探索无尽可能:我的LLM(小语言模型)

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
dataset.py 2.04 KB
一键复制 编辑 原始数据 按行查看 历史
庵中十三居士 提交于 2024-01-07 16:39 . 提交代码
from datasets import load_dataset
from torch.utils.data import Dataset as BaseDataset
import torch
from chat_models import Mirostat
from typing import Optional, List
import re
class ByteTokenizer(object):
    """Tokenizer whose tokens are the raw UTF-8 bytes of the text.

    Each token id is a single byte value, so no vocabulary file is needed.
    """

    # One token id per possible byte value.
    vocab_size: int = 256

    @classmethod
    def from_pretrained(cls, *args, **kwargs) -> "ByteTokenizer":
        """Return a new tokenizer; all arguments are accepted and ignored.

        Declared as a classmethod so that it works both on the class
        (``ByteTokenizer.from_pretrained()``) and on an instance.
        """
        return cls()

    def encode(self, inputString: str) -> List[int]:
        """Return the UTF-8 byte values of ``inputString`` as a list of ints."""
        return list(inputString.encode('utf-8'))

    def decode(self, tokens: List[int]) -> str:
        """Turn byte-valued token ids back into text.

        Byte sequences that are not valid UTF-8 are dropped
        (``errors='ignore'``) instead of raising. ``bytes()`` itself raises
        ``ValueError`` when a token is outside ``range(256)``.
        """
        return bytes(tokens).decode('utf-8', errors='ignore')
class Dataset(BaseDataset):
    """Instruction-tuning samples served as one-hot next-token pairs.

    Records are read from the ``'train'`` split of a JSON file loaded with
    ``load_dataset``; each record must provide the string fields
    ``instruction``, ``input`` and ``output``.
    """

    # Collapses any run of CR/LF characters into a single newline.
    # The character class deliberately contains no '|': inside [...] a pipe
    # is a literal, so including it would rewrite '|' in the text to '\n'.
    _role = re.compile(r'[\r\n]+')

    def __init__(self, file: str):
        """Load the JSON dataset at ``file`` and create the byte tokenizer."""
        # NOTE(review): ctx is not read anywhere in this class; it looks like
        # an intended context length - confirm against callers.
        self.ctx = 64
        self.squad_it_dataset = load_dataset("json", data_files=file)
        self.tokenizer = ByteTokenizer.from_pretrained()
        self.vocabSize = self.tokenizer.vocab_size

    def __len__(self) -> int:
        """Return the number of records in the ``'train'`` split."""
        return len(self.squad_it_dataset['train'])

    def _section(self, title: str, body: str) -> str:
        """Format one prompt section: header line, cleaned body, blank line."""
        return title + ':\n' + self._role.sub('\n', body).strip() + '\n\n'

    def __getitem__(self, idx) -> tuple:
        """Return ``(x, y)`` one-hot tensors for record ``idx``.

        The record is rendered as ``Instruction:`` / ``Input:`` / ``Output:``
        sections and tokenized. Both tensors have shape
        ``(len(tokens) - 1, vocab_size)``; row ``i`` of ``x`` encodes token
        ``i`` and row ``i`` of ``y`` encodes token ``i + 1``.
        """
        item = self.squad_it_dataset['train'][idx]
        tokens = []
        # Each section is encoded separately, in prompt order.
        for title, key in (('Instruction', 'instruction'),
                           ('Input', 'input'),
                           ('Output', 'output')):
            tokens.extend(self.tokenizer.encode(self._section(title, item[key])))

        vocab = self.tokenizer.vocab_size
        ids = torch.tensor(tokens, dtype=torch.long)
        steps = len(tokens) - 1
        rows = torch.arange(steps)
        # Indexed assignment sets one cell per row without a Python loop.
        x = torch.zeros(steps, vocab)
        y = torch.zeros(steps, vocab)
        x[rows, ids[:-1]] = 1.
        y[rows, ids[1:]] = 1.
        return x, y
if __name__ == '__main__':
    # Manual smoke test: load the dataset from a hard-coded local path, then
    # for the first four samples print the one-hot input tensor and the text
    # decoded from the tokens chosen by the Mirostat sampler.
    samples = Dataset("E:\\belle\\school_math_0.25M.json")
    sampler = Mirostat(0.1)
    byte_tokenizer = ByteTokenizer.from_pretrained()
    # enumerate() walks the dataset through __getitem__; the fifth sample is
    # still fetched before the loop stops.
    for position, pair in enumerate(samples):
        if position >= 4:
            break
        inputs, _targets = pair
        print(inputs)
        # One sampler call per row, in row order.
        # NOTE(review): presumably choise() returns a token id - confirm
        # against chat_models.Mirostat.
        chosen = [sampler.choise(inputs[row]) for row in range(inputs.shape[0])]
        print(byte_tokenizer.decode(chosen))
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/az13js/my_llm.git
git@gitee.com:az13js/my_llm.git
az13js
my_llm
探索无尽可能:我的LLM(小语言模型)
master

搜索帮助