// boost_search/searcher.hpp

#pragma once
#include "index.hpp"
#include "util.hpp"
#include "log.hpp"
#include <algorithm>
#include <jsoncpp/json/json.h>

namespace ns_searcher{

    // A single de-duplicated search hit: one document together with the
    // accumulated relevance weight and the list of words recorded for it.
    struct InvertedElemPrint{
        uint64_t doc_id = 0;            // id of the document this hit refers to
        int weight = 0;                 // accumulated relevance weight
        std::vector<std::string> words; // words recorded for this document

        // Members start out as 0 / 0 / empty via the initializers above.
        InvertedElemPrint(){}
    };

    class Searcher{
        private:
            ns_index::Index *index; //供系统进行查找的索引   单例
        public:
            Searcher(){}
            ~Searcher(){}
        public:
            void InitSearch(const std::string &input)
            {
                //1. 获取或者创建index对象
                index = ns_index::Index::GetInstance();
                //std::cout<<"获取index单例成功..."<<std::endl;
                LOG(NORMAL,"获取单例成功...");
                //2. 根据index对象建立索引
                index->BuildIndex(input);
                //std::cout<<"建立正排和倒排索引成功..."<<std::endl;
                LOG(NORMAL,"建立正排和倒排索引成功...");
            }

            //query:搜索关键字
            //json_string:返回给用户浏览器的数据/搜索结果
            void Search(const std::string &query,std::string *json_string)
            {
                //1.[分词]:对我们的query进行按照searcher的要求进行分词
                std::vector<std::string> words;
                ns_util::JiebaUtil::CutString(query,&words);
                //2.[触发]:就是根据分词的各个'词'，进行index查找，建立index是忽略大小写了的，所以搜索也要
                //ns_index::InvertedList inverted_list_all;//内部是InvertedElem
                std::vector<InvertedElemPrint> inverted_list_all;//只用来保存不重复的倒排拉链节点

                std::unordered_map<uint16_t,InvertedElemPrint> tokens_map;

                for(std::string word : words){
                    boost::to_lower(word);

                    ns_index::InvertedList *inverted_list = index->GetInwertedList(word);
                    if(nullptr == inverted_list){
                        continue;
                    }
                    //比如 你是一个好人 -> 你/是/一个/好人 原来是搜出四条一模一样的 现在去重了
                    //inverted_list_all.insert(inverted_list_all.end(),inverted_list->begin(),inverted_list->end());
                    for(const auto &elem:*inverted_list){
                        auto &item = tokens_map[elem.doc_id];//[]:如果存在直接获取，不存在就新建
                        //item一定是一doc_id相同的print节点
                        item.doc_id = elem.doc_id;
                        item.weight += elem.weight;
                        item.words.push_back(elem.word);
                    }
                }
                for(const auto &item:tokens_map){
                    inverted_list_all.push_back(std::move(item.second));
                }

                //3.[合并排序]:汇总查找结果，按照相关性(weight)降序排序
                // std::sort(inverted_list_all.begin(),inverted_list_all.end(),[](
                //     const ns_index::InvertedElem &e1,const ns_index::InvertedElem &e2){
                //     return e1.weight > e2.weight;
                // });
                    std::sort(inverted_list_all.begin(),inverted_list_all.end(),[](
                        const InvertedElemPrint &e1,const InvertedElemPrint &e2){
                        return e1.weight > e2.weight;
                    });
                //4.[构建]:根据查找出来的结果，构建json串 -- jsoncpp -- 通过jsoncpp完成序列化和反序列化
                Json::Value root;
                for(auto &item : inverted_list_all){
                    ns_index::DocInfo *doc = index->GetForwardIndex(item.doc_id);
                    if(nullptr == doc){
                        continue;
                    }
                    Json::Value elem;
                    elem["title"] = doc->title;
                    elem["desc"] = GetDesc(doc->content, item.words[0]);//content是文档去标签的结果，我们要的是摘要
                    elem["url"] = doc->url;
                    //for debug , for delete
                    //elem["id"] = (int)item.doc_id;
                    //elem["weight"] = item.weight; //int -> string

                    root.append(elem);
                }

                //Json::StyledWriter writer;
                Json::FastWriter writer;
                *json_string = writer.write(root);//将我们的root进行序列化
            }

            //获取摘要
            std::string GetDesc(const std::string &html_content, const std::string &word)
            {
                //找到word在html_content中的首次出现，然后往前找50byte（begin），往后100byte（end）
                const int prev_step = 50;
                const int next_step = 100;
                //1. 找到首次出现
                auto iter = std::search(html_content.begin(), html_content.end(), word.begin(), word.end(), [](int x, int y){
                    return (std::tolower(x) == std::tolower(y));
                });
                if(iter == html_content.end()){
                    return "None1";
                }
                //自带的distance
                int pos = std::distance(html_content.begin(), iter);
                // std::size_t pos = html_content.find(word);
                // if(pos == std::string::npos){
                //     return "None";//这种情况是不存在的
                // }
                //debug 发现有坑   搜索的时候，转小写没问题，因为要根据索引去查找，但是当去
                //获得摘要的时候，拿到的是文档内容（我们并不去改变文档），那么就可能匹配不上
                //（因为之前保存的是to_lower的word 但是原文不一定是小写的）

                //2. 获取start，end
                int start = 0;
                int end = html_content.size()-1;
                //如果之前有50+byte，就更新开始位置
                if(pos > start + prev_step) start = pos;
                if(pos + next_step < end) end = pos + next_step;
                //本来都是size_t的，但是因为减的化有坑，换成了int
                //if((int)pos <(int)(end - next_step)) end = pos + next_step

                //3. 截取子串，return
                if(start >= end) return "None2";
                return html_content.substr(start, end - start);
                // std::string desc = html_content.substr(start, end - start);
                // desc += "...";
                // return desc;
            }

    };
}