// SearchEngineOfBoostLibrary/searcher.hpp

#pragma  once
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

//#include <boost/algorithm/string.hpp>
#include "jsoncpp/json/json.h"

#include "index.hpp"
#include "tool.hpp"
#include "log.hpp"

namespace ns_searcher
{
    // One merged search hit: a document plus everything the query matched in it.
    struct InvertedNodePrint
    {
        uint64_t file_id = 0;            // id of the matched document
        int weight = 0;                  // accumulated weight over all matched key words
        std::vector<std::string> words;  // key words that hit this document

        InvertedNodePrint() = default;
    };
    class Searcher
    {
    public:
        Searcher(){}
        ~Searcher(){}
    public:
        void InitSearcher(const std::string& raw_txt)
        {
            //1.获取或创建index对象
            index=ns_index::Index::GetInstance();
            //std::cout<<"获取index单例成功..."<<std::endl;
            LOG(NORMAL,"获取index单例成功...");
            //2.根据index对象建立索引
            index->BuildIndex(raw_txt);
            //std::cout<<"建立索引成功..."<<std::endl;
            LOG(NORMAL,"建立索引成功...");
        }

        //aim_str:搜索词
        //json_str:返回给用户浏览器的搜索结果
        void Search(const std::string& aim_str,std::string* json_str)
        {
            //1.先根据用户的搜索词进行分词
            std::vector<std::string> words;
            ns_tool::JiebaTool::CutString(aim_str,&words);

            //2.根据分词后个各个“关键词”，进行index查找—触发
            //一个关键词一个拉链，把这些拉链保存在一起，合成一个拉链
            //ns_index::InvertList combin_list;
            std::unordered_map<uint64_t,InvertedNodePrint> take_map;//用来根据file_id去重，重复的Node都放这里的NodePrint节点
            std::vector<InvertedNodePrint> combin_list;//用打印节点构建的新的组合拉链

            for(std::string word:words)
            {
                boost::to_lower(word);

                //关键字->倒排拉链
                ns_index::InvertList* inver_list=index->GetInvertList(word);
                if(nullptr==inver_list)//没有倒排索引，该关键字没有在文件中出现过
                {
                    continue;//检测下一个词
                }

                //获得了这个关键词和文档id的关系，保存起来
                //combin_list.insert(combin_list.end(),inver_list->begin(),inver_list->end());

                //对建立好的倒排拉链去重
                for(auto& node:*inver_list)
                {
                    auto& it=take_map[node.file_id];//有就获取，没有就创建
                    //这里获取的文件一定都有相同的file_id
                    it.file_id=node.file_id;
                    it.weight+=node.weight;//累加权值
                    it.words.push_back(node.key_word);
                }
            }
            //把去重完成的map插入到组合拉链
            for(const auto& it:take_map)
            {
                combin_list.push_back(it.second);
            }

            // //3.汇总查找结果，按照相关性进行降序排列—合并排序
            // std::sort(combin_list.begin(),combin_list.end(),\
            //     [](const ns_Index::InvertNode& n1,const ns_Index::InvertNode& n2){
            //     return n1.weight>n2.weight;
            //     });
            //3.汇总查找结果，按照相关性进行降序排列—合并排序
            std::sort(combin_list.begin(),combin_list.end(),\
                [](const InvertedNodePrint& n1,const InvertedNodePrint& n2){
                return n1.weight>n2.weight;
                });

            //4.根据查找出来的结果，构建json串—构建（需要用到jsoncpp）
            Json::Value root;
            for(InvertedNodePrint& it : combin_list)
            {
                //遍历组合拉链的所有节点，根据文件id和正排索引，得到文件内容
                ns_index::Format_t* fmt=index->GetForwardIndex(it.file_id);
                if(nullptr==fmt)
                    continue;

                //将想要给浏览器返回的内容append到json串中
                Json::Value item;
                item["title"]=fmt->title;
                item["describe"]=GetDes(fmt->describe,it.words[0]);//这里我们不需要带全部内容，只要一部分
                item["url"]=fmt->url;
                //for debug
                item["id"]=(int)it.file_id;
                item["weight"]=it.weight;
                root.append(item);
            }
            Json::FastWriter writer;
            *json_str=writer.write(root);
        }
        std::string GetDes(const std::string& content,const std::string& key_word)
        {
            //找到关键字在文档内容中的首次出现，然后往前找50字节，往后找150字节，截取出这部分
            //如果前面没有50个，就从begin开始，如果后面没有150，就截止到end
            const int prev=50;
            const int next=150;
            //1.找到首次出现
            //不能用find找C++的search接口可以忽略大小写查找
            auto it=std::search(content.begin(),content.end(),key_word.begin(),key_word.end(),[](int x,int y){
                return (std::tolower(x)==std::tolower(y));
            });
            if(it==content.end())
                return "None1";
            int pos=std::distance(content.begin(),it);

            //2.获取start位置和end位置
            int start=0;
            int end=content.size()-1;
            if(pos-prev > 0)
                start=pos-prev;
            if((pos+next) < content.size())
                end=pos+next;

            //3.截取字串返回
            if(start>=end) return "None2";//几乎不可能出现
            return content.substr(start,end-start);
        }
    private:
        ns_index::Index* index;//供系统查找索引
    };
}