// NOTE: stray web-page text, not part of the source: "Code pull complete; the page will refresh automatically."
#pragma once
#include<mutex>
#include <iostream>
#include <string>
#include <vector>
#include <unordered_map>
#include <fstream>
#include "tool.hpp"
#include "log.hpp"
using namespace std;
namespace project_index
{
// One parsed document as stored in the forward index.
// Member order is part of the type's layout and brace-initialisation order.
struct format
{
    std::string title;    // document title
    std::string url;      // document URL
    std::string content;  // document body text
    uint64_t docid;       // document id
};
typedef format Format;
// One posting in an inverted list: ties a keyword to a document together with
// the weight of that keyword within the document.
typedef struct Inverted_zipper
{
    uint64_t docid;       // id of the document this posting refers to
    std::string keyword;  // the keyword itself
    int weight;           // weight of the keyword in the document

    // Zero-initialise both scalar members so a default-constructed posting
    // never carries an indeterminate docid.
    Inverted_zipper()
        : docid(0), weight(0) {}
} Inverted_zipper;
class index
{
private:
vector<Format> Front_index;//正排索引 下标模拟文档id
unordered_map<string,vector<Inverted_zipper>> inverted_index;//倒排 关键词与多个(一个)倒排拉链的对应
static index * Index;
static mutex mtx;
index(const index &)=delete;
index& operator=(const index&)=delete;
index()
{}
public:
~index()
{}
static index* GetIndex()
{
if(nullptr == Index)
{
mtx.lock();
if(nullptr == Index){
Index = new index();
}
mtx.unlock();
}
return Index;
}
//id获得文档内容
Format* GetFront_index(uint64_t docid)
{
if(docid>=Front_index.size())
{
LOG(Warning,"docid>=Front_index.size");
return nullptr;
}
return &Front_index[docid];
}
//关键词获得倒排拉链
vector<Inverted_zipper>* Getinverted_index(const string &keyword)
{
auto it = inverted_index.find(keyword);
if(it == inverted_index.end())
{
LOG(Warning,"keyword find Warning");
return nullptr;
}
return &(it->second);
}
//建立索引 数据源:parser处理完的数据
bool Establish_index(const string &raw)
{
ifstream in(raw,ios::in | ios::binary);
if(!in.is_open())
{
LOG(Warning,"in.is_open Warning");
return false;
}
string temp;
int count =0;
while(getline(in,temp))
{
Format* doc = Establish_Front_index(temp);//建立正排索引
if(doc == nullptr)
{
LOG(Warning,"Establish_Front_index warning");
continue;
}
/*if(doc->docid == 5008)
{
cout << "建立5008倒排索引" << endl;
}*/
bool flag = Establish_inverted_index(*doc);//建立倒排索引
count++;
LOG(Info,"当前已经建立索引的文档 :" + to_string(count));
}
return true;
}
private:
Format* Establish_Front_index(string &temp)
{
//切分temp
vector<string> result;
string sep = "\3";
bool flag = project_tool::stringtool::Slice_strings(temp,&result,sep);
if(!flag)
{
LOG(Warning,"Slice_strings WARNING");
return nullptr;
}
//切分好后放到Format
Format doc;
if(result.size() != 3)
{
LOG(Warning,"Slice_strings WARNING");
return nullptr;
}
doc.title = result[0];
doc.content = result[1];
doc.url = result[2];
//doc.docid = result.size(); //id为vector下标
doc.docid = Front_index.size();
//结果插入正排索引
Front_index.push_back(move(doc));//move性能优化
return &Front_index.back();
}
bool Establish_inverted_index(Format &doc)//建立倒排
{
struct word_count
{
int title_count;
int content_count;
word_count():title_count(0),content_count(0){}
};
vector<string> title_result;
project_tool::jiebatool::CutString(doc.title,&title_result);
unordered_map<string,word_count> word_map;
for(string &s:title_result)
{
boost::to_lower(s);
word_map[s].title_count++;
}
vector<string> content_result;
project_tool::jiebatool::CutString(doc.content,&content_result);
for(string & s : content_result)
{
boost::to_lower(s);
word_map[s].content_count++;
}
const int title_corr = 10;
for(auto &iter : word_map)
{
Inverted_zipper temp;
temp.docid = doc.docid;
temp.keyword = iter.first;
temp.weight = title_corr * (iter.second.title_count)+ iter.second.content_count;
vector<Inverted_zipper> &vector_temp = inverted_index[iter.first];
vector_temp.push_back(move(temp));
}
return true;
}
};
index * index::Index = nullptr;
mutex index::mtx;
}
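// NOTE: stray web-page text, not part of the source: "This area may contain content unsuitable for display, so the page does not show it. You can review and modify it using the relevant editing features."
// "If you confirm the content contains no inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, or anything violating applicable laws and regulations, you may click submit to appeal and we will handle it as soon as possible."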