1 Star 0 Fork 0

匿名者/项目boost搜索引擎

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
index.hpp 5.04 KB
一键复制 编辑 原始数据 按行查看 历史
匿名者 提交于 2024-09-17 19:38 . 索引模块
#pragma once
#include<mutex>
#include <iostream>
#include <string>
#include <vector>
#include <unordered_map>
#include <fstream>
#include "tool.hpp"
#include "log.hpp"
using namespace std;
namespace project_index
{
// One parsed document record, as stored in the forward index.
// Plain aggregate: members are not initialized by default, and the member
// order (title, url, content, docid) is part of the type's layout.
struct format
{
    std::string title;   // document title
    std::string url;     // document URL
    std::string content; // document body text
    uint64_t docid;      // document id
};
// Alias kept so both spellings, `format` and `Format`, name the same type.
typedef format Format;
// One posting of an inverted list: associates a keyword with a document and
// the weight of that keyword for the document.
typedef struct Inverted_zipper
{
    uint64_t docid;      // id of the document this posting refers to
    std::string keyword; // the keyword
    int weight;          // weight of the keyword for this document

    // Zero-initialize both numeric members so that a default-constructed
    // posting never carries an indeterminate docid.
    Inverted_zipper()
        : docid(0), weight(0) {}
} Inverted_zipper;
class index
{
private:
vector<Format> Front_index;//正排索引 下标模拟文档id
unordered_map<string,vector<Inverted_zipper>> inverted_index;//倒排 关键词与多个(一个)倒排拉链的对应
static index * Index;
static mutex mtx;
index(const index &)=delete;
index& operator=(const index&)=delete;
index()
{}
public:
~index()
{}
static index* GetIndex()
{
if(nullptr == Index)
{
mtx.lock();
if(nullptr == Index){
Index = new index();
}
mtx.unlock();
}
return Index;
}
//id获得文档内容
Format* GetFront_index(uint64_t docid)
{
if(docid>=Front_index.size())
{
LOG(Warning,"docid>=Front_index.size");
return nullptr;
}
return &Front_index[docid];
}
//关键词获得倒排拉链
vector<Inverted_zipper>* Getinverted_index(const string &keyword)
{
auto it = inverted_index.find(keyword);
if(it == inverted_index.end())
{
LOG(Warning,"keyword find Warning");
return nullptr;
}
return &(it->second);
}
//建立索引 数据源:parser处理完的数据
bool Establish_index(const string &raw)
{
ifstream in(raw,ios::in | ios::binary);
if(!in.is_open())
{
LOG(Warning,"in.is_open Warning");
return false;
}
string temp;
int count =0;
while(getline(in,temp))
{
Format* doc = Establish_Front_index(temp);//建立正排索引
if(doc == nullptr)
{
LOG(Warning,"Establish_Front_index warning");
continue;
}
/*if(doc->docid == 5008)
{
cout << "建立5008倒排索引" << endl;
}*/
bool flag = Establish_inverted_index(*doc);//建立倒排索引
count++;
LOG(Info,"当前已经建立索引的文档 :" + to_string(count));
}
return true;
}
private:
Format* Establish_Front_index(string &temp)
{
//切分temp
vector<string> result;
string sep = "\3";
bool flag = project_tool::stringtool::Slice_strings(temp,&result,sep);
if(!flag)
{
LOG(Warning,"Slice_strings WARNING");
return nullptr;
}
//切分好后放到Format
Format doc;
if(result.size() != 3)
{
LOG(Warning,"Slice_strings WARNING");
return nullptr;
}
doc.title = result[0];
doc.content = result[1];
doc.url = result[2];
//doc.docid = result.size(); //id为vector下标
doc.docid = Front_index.size();
//结果插入正排索引
Front_index.push_back(move(doc));//move性能优化
return &Front_index.back();
}
bool Establish_inverted_index(Format &doc)//建立倒排
{
struct word_count
{
int title_count;
int content_count;
word_count():title_count(0),content_count(0){}
};
vector<string> title_result;
project_tool::jiebatool::CutString(doc.title,&title_result);
unordered_map<string,word_count> word_map;
for(string &s:title_result)
{
boost::to_lower(s);
word_map[s].title_count++;
}
vector<string> content_result;
project_tool::jiebatool::CutString(doc.content,&content_result);
for(string & s : content_result)
{
boost::to_lower(s);
word_map[s].content_count++;
}
const int title_corr = 10;
for(auto &iter : word_map)
{
Inverted_zipper temp;
temp.docid = doc.docid;
temp.keyword = iter.first;
temp.weight = title_corr * (iter.second.title_count)+ iter.second.content_count;
vector<Inverted_zipper> &vector_temp = inverted_index[iter.first];
vector_temp.push_back(move(temp));
}
return true;
}
};
index * index::Index = nullptr;
mutex index::mtx;
}
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/adexiur/project-boost-search-engine.git
git@gitee.com:adexiur/project-boost-search-engine.git
adexiur
project-boost-search-engine
项目boost搜索引擎
master

搜索帮助