代码拉取完成,页面将自动刷新
import os
import shutil
from langchain.vectorstores import ElasticKnnSearch
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from configs.params import ESParams
from embedding import Embeddings
from langchain.vectorstores import Chroma
from embedding import Embeddings
# Directory containing this module; persist_directory below is built from it.
CUR_DIR = os.path.dirname(__file__)
def load_file(filepath, chunk_size, chunk_overlap):
    """Load a UTF-8 text file and split it into chunked documents.

    Args:
        filepath: Path of the text file handed to ``TextLoader``.
        chunk_size: Forwarded to ``CharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``CharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The result of ``split_documents`` applied to the loaded file, using
        a newline as the split separator.
    """
    # Load first, then build the splitter, so a failure surfaces in the same
    # order as it always has.
    raw_documents = TextLoader(filepath, encoding='utf-8').load()
    splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(raw_documents)
# Path passed to Chroma as its persist_directory (see VectorSearch.__init__).
persist_directory = os.path.join(CUR_DIR, './data/vectorbase')
class VectorSearch:
    """Chroma-backed document store offering upload and similarity search.

    Uploaded text files are moved under ``data/``, split into chunks by
    ``load_file`` and added to the Chroma collection opened in ``__init__``.
    Each chunk's ``source`` metadata (``data/<filename>``) is used both to
    detect files that were already indexed and to build result titles.
    """

    # Relative directory uploaded files are moved into. It is also the prefix
    # of the ``source`` metadata value, so it must stay in sync with what was
    # stored for previously indexed files.
    _DATA_PREFIX = 'data/'

    def __init__(self, embedding_model_path, chunk_size=500, chunk_overlap=0) -> None:
        """Create the embedding model and open the Chroma collection.

        Args:
            embedding_model_path: Path passed to ``Embeddings``.
            chunk_size: Not used by the constructor; kept so existing callers
                keep working.
            chunk_overlap: Not used by the constructor; kept so existing
                callers keep working.
        """
        self.embeddings = Embeddings(embedding_model_path)  # embedding model
        print(os.path.abspath(persist_directory))
        # NOTE(review): the collection name is kept exactly as spelled;
        # renaming it would presumably stop matching already-persisted data.
        self.vectorbase = Chroma("MyKonwledge", self.embeddings, persist_directory=persist_directory)

    def _is_indexed(self, file_path):
        """Return True when at least one stored entry has this ``source``."""
        found = self.vectorbase.get(where={'source': file_path})
        return len(found['ids']) > 0

    def doc_upload(self, file_obj, chunk_size, chunk_overlap):
        """Move an uploaded file under ``data/`` and index its chunks.

        Args:
            file_obj: Object whose ``name`` attribute is the path of the
                uploaded file.
            chunk_size: Chunk size forwarded to ``load_file``.
            chunk_overlap: Chunk overlap forwarded to ``load_file``.

        Returns:
            ``"插入成功"`` on success, an "already exists" message when a file
            with the same name was indexed before, or the caught exception
            object itself when any step fails.
        """
        try:
            filename = os.path.basename(file_obj.name)
            file_path = self._DATA_PREFIX + filename
            if self._is_indexed(file_path):
                return "文件[ {} ]已经存在".format(filename)
            shutil.move(file_obj.name, file_path)
            docs = load_file(file_path, chunk_size, chunk_overlap)
            self.vectorbase.add_documents(docs)  # add the chunks to the vector store
            return "插入成功"
        except Exception as e:
            # Best-effort by design: the error is handed back to the caller
            # as the return value instead of being raised.
            return e

    def doc_upload_test(self, chunk_size=300, chunk_overlap=0):
        """Index the bundled ``data/case.txt`` sample file.

        Args:
            chunk_size: Chunk size forwarded to ``load_file``.
            chunk_overlap: Chunk overlap forwarded to ``load_file``.

        Returns:
            An "already exists" message when the sample was indexed before;
            otherwise ``None`` after the chunks have been added.
        """
        filename = 'case.txt'
        file_path = self._DATA_PREFIX + filename
        if self._is_indexed(file_path):
            return "文件[ {} ]已经存在".format(filename)
        docs = load_file(file_path, chunk_size, chunk_overlap)
        self.vectorbase.add_documents(docs)  # add the chunks to the vector store

    def doc_search(self, method, query, top_k, knn_boost):
        """Run a similarity search and return content/title pairs.

        Args:
            method: Accepted for interface compatibility; not used.
            query: Text to search for.
            top_k: Number of results to request. A falsy value (``None`` or
                ``0``) leaves the vector store's own default in effect.
            knn_boost: Accepted for interface compatibility; not used.

        Returns:
            A list of ``{'content': ..., 'title': ...}`` dicts, one per hit,
            where ``title`` is the ``source`` metadata without its leading
            ``data/`` prefix.
        """
        # Forward the requested result count; it used to be dropped, so the
        # store always fell back to its default regardless of ``top_k``.
        search_kwargs = {}
        if top_k:
            search_kwargs['k'] = int(top_k)
        docs = self.vectorbase.similarity_search(query, **search_kwargs)

        prefix = self._DATA_PREFIX
        result = []
        for doc in docs:
            source = doc.metadata['source']
            # Strip the storage prefix only when it is really there, so a
            # source recorded some other way is not truncated.
            title = source[len(prefix):] if source.startswith(prefix) else source
            result.append({
                'content': doc.page_content,
                'title': title,
            })
        return result
if __name__ == "__main__":
    # Manual smoke test: index the sample file, then run one query.
    searcher = VectorSearch("./model/m3e")
    print(searcher.doc_upload_test())
    print(searcher.doc_search("精确查询", "工程伦理", 3, 0.5))
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。