doc_search_memory.py (赵天祥/python-project, https://gitee.com/lzu_ztx/python-project; no LICENSE file declared)
import os
import shutil

from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

from embedding import Embeddings

CUR_DIR = os.path.dirname(__file__)
def load_file(filepath, chunk_size, chunk_overlap):
    # Load a UTF-8 text file and split it into newline-separated chunks.
    loader = TextLoader(filepath, encoding='utf-8')
    documents = loader.load()
    text_splitter = CharacterTextSplitter(separator='\n', chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs
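
# A minimal usage sketch for load_file (assumption: a UTF-8 text file exists
# at data/case.txt; the chunk sizes below are illustrative, not tuned values):
#
#   docs = load_file('data/case.txt', chunk_size=500, chunk_overlap=0)
#   print(len(docs), docs[0].page_content[:80], docs[0].metadata['source'])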
# Where the Chroma collection is persisted on disk.
persist_directory = os.path.join(CUR_DIR, 'data', 'vectorbase')


class VectorSearch:
    def __init__(self, embedding_model_path, chunk_size=500, chunk_overlap=0) -> None:
        self.embeddings = Embeddings(embedding_model_path)  # embedding model
        print(os.path.abspath(persist_directory))
        # Chroma(collection_name, embedding_function, persist_directory=...)
        self.vectorbase = Chroma("MyKnowledge", self.embeddings, persist_directory=persist_directory)
    def doc_upload(self, file_obj, chunk_size, chunk_overlap):
        try:
            # Former Elasticsearch path, kept for reference:
            # if not self.client.indices.exists(index=self.es_params.index_name):
            #     dims = len(self.embedding.embed_query("test"))
            #     mapping = _default_knn_mapping(dims)
            #     self.client.indices.create(index=self.es_params.index_name, body={"mappings": mapping})
            filename = os.path.split(file_obj.name)[-1]
            file_path = 'data/' + filename
            # Skip files whose chunks are already in the collection.
            find = self.vectorbase.get(where={'source': file_path})
            if len(find['ids']) > 0:
                return "File [ {} ] already exists".format(filename)
            shutil.move(file_obj.name, file_path)
            docs = load_file(file_path, chunk_size, chunk_overlap)
            self.vectorbase.add_documents(docs)  # add the chunks to the vector store
            return "Insert succeeded"
        except Exception as e:
            return str(e)
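
    # Hedged note: with chromadb >= 0.4, a Chroma store created with a
    # persist_directory writes to disk automatically; on older
    # langchain/chromadb versions an explicit self.vectorbase.persist()
    # call after add_documents may be needed to make the data survive restarts.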
    def doc_upload_test(self, chunk_size=300, chunk_overlap=0):
        # Index the bundled sample file unless it is already in the collection.
        filename = 'case.txt'
        file_path = 'data/' + filename
        find = self.vectorbase.get(where={'source': file_path})
        if len(find['ids']) > 0:
            return "File [ {} ] already exists".format(filename)
        docs = load_file(file_path, chunk_size, chunk_overlap)
        self.vectorbase.add_documents(docs)  # add the chunks to the vector store
        return "Insert succeeded"
    def doc_search(self, method, query, top_k, knn_boost):
        # `method` and `knn_boost` are unused leftovers from the Elasticsearch
        # version; the Chroma store always does plain similarity search.
        result = []
        docs = self.vectorbase.similarity_search(query, k=top_k)
        for doc in docs:
            result.append({
                'content': doc.page_content,
                'title': doc.metadata['source'][len('data/'):]  # strip the 'data/' prefix
            })
        return result
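
    # Example return value (illustrative, assuming data/case.txt was indexed):
    #   [{'content': '...chunk text...', 'title': 'case.txt'}, ...]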

if __name__ == "__main__":
    vs = VectorSearch("./model/m3e")
    res = vs.doc_upload_test()
    print(res)
    # Query: "工程伦理" ("engineering ethics"); the first argument
    # ("精确查询", "exact search") fills the unused `method` parameter.
    print(vs.doc_search("精确查询", "工程伦理", 3, 0.5))
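
# To run this file directly (assumptions: an m3e embedding model directory at
# ./model/m3e wrapped by the local embedding.Embeddings class, and a sample
# file at data/case.txt):
#
#   python doc_search_memory.py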