1 Star 0 Fork 0

JiajiaPig/boost搜索引擎

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
util.hpp 3.71 KB
一键复制 编辑 原始数据 按行查看 历史
JiajiaPig 提交于 2023-08-16 20:45 . boost搜索引擎
#pragma once
#include <string>
#include <fstream>
#include <boost/algorithm/string.hpp>
#include "SearcherLog.hpp"
#include "./cppjieba/Jieba.hpp"
namespace ns_util
{
// Scanner state used by HtmlFileUtil::parseContent while walking an HTML
// document one character at a time.
// (The enumerator spelling "LABLE" is kept as-is because parseContent
// refers to it by that name.)
enum Status
{
// Inside markup: characters are skipped until a '>' is seen.
LABLE,
// Outside markup: characters are copied to the output until a '<' is seen.
CONTENT
};
class HtmlFileUtil
{
public:
// Reads the text file at file_path and appends its contents to *result.
//
// The file is consumed with std::getline, which drops each line terminator,
// and the lines are concatenated with nothing in between. *result is appended
// to, never cleared.
// NOTE(review): because no separator is re-inserted, the last word of one
// line and the first word of the next end up adjacent - confirm this is
// intended.
//
// Returns false (after logging through searcherLog at DEBUG level) when the
// file cannot be opened; *result is left untouched in that case. Returns true
// otherwise.
static bool readHtmlFile(const std::string &file_path, std::string *result)
{
    std::ifstream input(file_path, std::ios::in);
    if (input.is_open())
    {
        // Pull the file in one line at a time and glue the lines together.
        for (std::string current; std::getline(input, current);)
        {
            result->append(current);
        }
        input.close();
        return true;
    }

    searcherLog(DEBUG, "open file error");
    return false;
}
// Extracts the text between the first "<title>" and the first "</title>"
// that follows it.
//
// file       - the HTML document to scan.
// file_title - receives the title text (may be empty) on success; left
//              untouched on failure.
//
// Returns false when there is no "<title>" or no "</title>" after it.
static bool parseTitle(const std::string &file, std::string *file_title)
{
    const std::string open_tag("<title>");
    const std::string close_tag("</title>");

    // Keep positions in size_type: storing find() results in int truncates
    // them and makes the npos comparison depend on implicit conversions.
    const std::string::size_type open_pos = file.find(open_tag);
    if (open_pos == std::string::npos)
    {
        return false;
    }

    // Start looking for the closing tag after the opening one so that a stray
    // "</title>" earlier in the document cannot hide a well-formed title.
    const std::string::size_type text_begin = open_pos + open_tag.size();
    const std::string::size_type text_end = file.find(close_tag, text_begin);
    if (text_end == std::string::npos)
    {
        return false;
    }

    *file_title = file.substr(text_begin, text_end - text_begin);
    return true;
}
static bool parseContent(const std::string &file, std::string *file_content)
{
Status status = LABLE;
for (char ch : file)
{
switch (status)
{
case LABLE:
if (ch == '>')
{
status = CONTENT;
}
break;
case CONTENT:
if (ch == '<')
{
status = LABLE;
}
else
{
// 消除正文中的换行符
if (ch == '\n')
{
ch = ' ';
}
file_content->push_back(ch);
}
break;
default:
break;
}
}
return true;
}
// Builds the online documentation URL for a local HTML file.
//
// file_path - path of the local file.
// file_url  - receives the resulting URL on success; left untouched on failure.
// raw_path  - leading part of file_path to strip; only its length is used,
//             i.e. the first raw_path.size() characters of file_path are
//             removed and the rest is appended to the fixed URL head.
//             NOTE(review): file_path is not checked to actually begin with
//             raw_path - confirm callers guarantee that.
//
// Returns false when file_path is shorter than raw_path (previously this case
// threw std::out_of_range from substr); returns true otherwise.
static bool parseUrl(const std::string &file_path, std::string *file_url, const std::string& raw_path)
{
    if (file_path.size() < raw_path.size())
    {
        return false;
    }

    const std::string url_head = "https://www.boost.org/doc/libs/1_81_0/doc/html";
    *file_url = url_head + file_path.substr(raw_path.size());
    return true;
}
};
// String helpers built on Boost.StringAlgo.
class StringUtil
{
public:
    // Splits `file` into tokens and stores them in *out.
    //
    // Every character of `sep` acts as a delimiter (boost::is_any_of), and
    // runs of adjacent delimiters are merged into one
    // (boost::token_compress_on). Per the Boost documentation, boost::split
    // overwrites the previous contents of *out.
    static void splitSrting(const std::string& file, std::vector<std::string>* out, const std::string& sep)
    {
        std::vector<std::string> &tokens = *out;
        boost::split(tokens,
                     file,
                     boost::is_any_of(sep),
                     boost::token_compress_on);
    }
};
// jieba dictionary files.
// These five paths are passed, in the order DICT, HMM, USER_DICT, IDF,
// STOP_WORD, to the cppjieba::Jieba constructor used by JiebaUtil.
// NOTE(review): the paths are relative ("./dict/..."), so they presumably
// resolve against the process working directory - confirm the binaries are
// always started from the directory that contains ./dict.
const char *const DICT_PATH = "./dict/jieba.dict.utf8";
const char *const HMM_PATH = "./dict/hmm_model.utf8";
const char *const USER_DICT_PATH = "./dict/user.dict.utf8";
const char *const IDF_PATH = "./dict/idf.utf8";
const char *const STOP_WORD_PATH = "./dict/stop_words.utf8";
class JiebaUtil
{
public:
static void CutString(const std::string &src, std::vector<std::string> *out)
{
jieba.CutForSearch(src, *out);
}
private:
static cppjieba::Jieba jieba;
};
cppjieba::Jieba JiebaUtil::jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH);
}
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/jiajiapig/boost-search-engine.git
git@gitee.com:jiajiapig/boost-search-engine.git
jiajiapig
boost-search-engine
boost搜索引擎
master

搜索帮助