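// NOTE: residue from the code-hosting web page, not part of this header. Original text: "Code pull finished; the page will refresh automatically."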
#pragma once
#include <fstream>
#include <string>
#include <vector>
#include <boost/algorithm/string.hpp>
#include "SearcherLog.hpp"
#include "./cppjieba/Jieba.hpp"
namespace ns_util
{
// Scanner state used by HtmlFileUtil::parseContent while stripping tags.
// The spelling "LABLE" is kept because parseContent refers to it by that name.
enum Status
{
    LABLE = 0,  // scanning tag markup; parseContent leaves this state on '>'
    CONTENT = 1 // scanning text between tags; parseContent leaves this state on '<'
};
class HtmlFileUtil
{
public:
// Reads the file at file_path line by line and appends every line to *result.
//
// file_path: path of the file to open for reading.
// result:    destination string; existing content is kept and the file text
//            is appended after it.
//
// Returns false (after logging "open file error" at DEBUG level) when the file
// cannot be opened, true otherwise.
static bool readHtmlFile(const std::string &file_path, std::string *result)
{
    std::ifstream input(file_path, std::ios::in);
    if (!input.is_open())
    {
        searcherLog(DEBUG, "open file error");
        return false;
    }

    // std::getline drops the '\n' delimiter, so consecutive lines are joined
    // with nothing between them.
    for (std::string current_line; std::getline(input, current_line);)
    {
        result->append(current_line);
    }

    input.close();
    return true;
}
// Extracts the text between the first "<title>" and the first "</title>" that
// follows it.
//
// file:       the document text to search.
// file_title: receives the title text on success; left untouched on failure.
//
// Returns true when both tags were found (the title itself may be empty),
// false when either tag is missing.
static bool parseTitle(const std::string &file, std::string *file_title)
{
    const std::string open_tag("<title>");
    const std::string close_tag("</title>");

    // Positions are kept as std::string::size_type: storing the result of
    // find() in an int truncates on large inputs and only compares equal to
    // npos by accident of the signed-to-unsigned conversion.
    const std::string::size_type open_pos = file.find(open_tag);
    if (open_pos == std::string::npos)
    {
        return false;
    }

    // Look for the closing tag only after the opening tag, so a stray
    // "</title>" earlier in the document cannot hide a valid title.
    const std::string::size_type text_begin = open_pos + open_tag.size();
    const std::string::size_type text_end = file.find(close_tag, text_begin);
    if (text_end == std::string::npos)
    {
        return false;
    }

    file_title->assign(file, text_begin, text_end - text_begin);
    return true;
}
static bool parseContent(const std::string &file, std::string *file_content)
{
Status status = LABLE;
for (char ch : file)
{
switch (status)
{
case LABLE:
if (ch == '>')
{
status = CONTENT;
}
break;
case CONTENT:
if (ch == '<')
{
status = LABLE;
}
else
{
// 消除正文中的换行符
if (ch == '\n')
{
ch = ' ';
}
file_content->push_back(ch);
}
break;
default:
break;
}
}
return true;
}
// Builds the online documentation URL for a local file.
//
// The first raw_path.size() characters of file_path are dropped and the rest
// is appended to the fixed Boost 1.81.0 documentation URL prefix.
//
// file_path: path of the local file.
// file_url:  receives the resulting URL on success; left untouched on failure.
// raw_path:  leading part of file_path to drop. Only its length is used; it is
//            not checked to be an actual prefix of file_path.
//
// Returns false when file_path is shorter than raw_path, true otherwise.
static bool parseUrl(const std::string &file_path, std::string *file_url, const std::string& raw_path)
{
    // substr() throws std::out_of_range when the offset exceeds the string
    // length, so report that case through the return value instead.
    if (file_path.size() < raw_path.size())
    {
        return false;
    }

    const std::string url_head = "https://www.boost.org/doc/libs/1_81_0/doc/html";
    *file_url = url_head + file_path.substr(raw_path.size());
    return true;
}
};
// String helpers.
class StringUtil
{
public:
    // Splits file into tokens and stores them in *out via boost::split.
    //
    // file: the text to split.
    // out:  receives the tokens.
    // sep:  set of separator characters; each character of sep is a separator
    //       (boost::is_any_of).
    //
    // boost::token_compress_on is passed, which per the Boost documentation
    // makes a run of adjacent separators count as a single separator.
    static void splitSrting(const std::string& file, std::vector<std::string>* out, const std::string& sep)
    {
        const auto is_separator = boost::is_any_of(sep);
        boost::split(*out, file, is_separator, boost::token_compress_on);
    }
};
// Paths of the jieba dictionary files. They are relative paths and are passed
// to the cppjieba::Jieba constructor used by JiebaUtil.
constexpr const char *DICT_PATH = "./dict/jieba.dict.utf8";
constexpr const char *HMM_PATH = "./dict/hmm_model.utf8";
constexpr const char *USER_DICT_PATH = "./dict/user.dict.utf8";
constexpr const char *IDF_PATH = "./dict/idf.utf8";
constexpr const char *STOP_WORD_PATH = "./dict/stop_words.utf8";
class JiebaUtil
{
public:
static void CutString(const std::string &src, std::vector<std::string> *out)
{
jieba.CutForSearch(src, *out);
}
private:
static cppjieba::Jieba jieba;
};
cppjieba::Jieba JiebaUtil::jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH);
}
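// NOTE: residue from the code-hosting web page, not part of this header. Original text: "This section may contain content unsuitable for display, so the page does not show it. You can review and modify it with the editing features.
// If you confirm the content involves no inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false or worthless content, and nothing that violates national laws and regulations, you can click submit to appeal and we will handle it as soon as possible."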