Fetch the repository succeeded.
#pragma once
#include <algorithm>
#include <fstream>
#include <iostream>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include <boost/algorithm/string.hpp>

#include "ccjieba/Jieba.hpp"
#include "log.hpp"
namespace ns_util{
/// File helpers.
class FileUtil{
public:
    /// Appends the text of the file at `file_path` to `*out`.
    ///
    /// The file is consumed line by line with std::getline, which drops each
    /// line terminator; nothing is re-inserted, so consecutive lines end up
    /// joined back-to-back. Whatever `*out` already holds is kept.
    ///
    /// @param file_path  path of the file to read
    /// @param out        destination string; must not be null
    /// @return true on success; false (after reporting on std::cerr) when the
    ///         file cannot be opened
    static bool ReadFile(const std::string &file_path,std::string *out)
    {
        std::ifstream input(file_path, std::ios::in);
        if (input.is_open()) {
            // std::getline returns the stream itself; the stream converts to
            // bool in the loop condition and becomes false once no further
            // line can be extracted, which ends the loop.
            for (std::string current; std::getline(input, current); ) {
                out->append(current);
            }
            input.close();
            return true;
        }

        std::cerr<<"open file "<<file_path<<" error"<<std::endl;
        return false;
    }
};
/// String helpers.
class StringUtil{
public:
    /// Splits `target` into tokens and stores them in `*out`.
    ///
    /// @param target  text to split
    /// @param out     receives the tokens; must not be null
    /// @param sep     set of separator characters; any one of them delimits
    ///                a token
    static void Split(const std::string &target,std::vector<std::string> *out,const std::string &sep)
    {
        // Matches every character contained in `sep`.
        const auto is_separator = boost::is_any_of(sep);

        // Arguments: result container, input text, separator predicate, and
        // the compression mode. With token_compress_on a run of adjacent
        // separators (e.g. "aaa\3\3\3bbb") is treated as a single one; with
        // the default (off) every pair of neighbouring separators would yield
        // an empty token.
        boost::split(*out, target, is_separator, boost::token_compress_on);
    }
};
// Dictionary and model files, given relative to the process working directory.
// JiebaUtil passes all five to the cppjieba::Jieba constructor, and additionally
// reads STOP_WORD_PATH itself to build its stop-word table.
constexpr const char* DICT_PATH      = "./dict/jieba.dict.utf8";
constexpr const char* HMM_PATH       = "./dict/hmm_model.utf8";
constexpr const char* USER_DICT_PATH = "./dict/user.dict.utf8";
constexpr const char* IDF_PATH       = "./dict/idf.utf8";
constexpr const char* STOP_WORD_PATH = "./dict/stop_words.utf8";
/// Process-wide word segmenter.
///
/// Wraps a single cppjieba::Jieba object and removes stop words from its
/// search-mode segmentation output. The one instance is created lazily by
/// get_instance(); most callers only need the static CutString().
class JiebaUtil{
private:
    cppjieba::Jieba jieba;
    // Stop words loaded from STOP_WORD_PATH. Only the keys are consulted; the
    // mapped bool is always true.
    std::unordered_map<std::string,bool> stop_words;
private:
    // Builds the segmenter and loads the stop words as part of construction,
    // so the object is never reachable through get_instance() in a
    // half-initialized state.
    JiebaUtil():jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH)
    {
        InitJiebaUtil();
    }
    JiebaUtil(const JiebaUtil&) = delete;
    JiebaUtil& operator=(const JiebaUtil&) = delete;
public:
    /// Returns the shared instance, creating it on first use.
    ///
    /// The instance is a function-local static: since C++11 its
    /// initialization runs exactly once even when several threads call this
    /// concurrently, and a constructor that throws leaves it uninitialized so
    /// a later call retries. This replaces a hand-written double-checked lock
    /// on a plain pointer, which raced on the pointer and published the
    /// object before its stop words were loaded. It also removes the need for
    /// an out-of-class static member definition in this header.
    static JiebaUtil* get_instance()
    {
        static JiebaUtil instance;
        return &instance;
    }

    /// Loads the stop-word list from STOP_WORD_PATH, one entry per line.
    ///
    /// Called by the constructor. If the file cannot be opened the failure is
    /// logged and the table is left as it was, which means no stop words are
    /// filtered.
    void InitJiebaUtil()
    {
        std::ifstream in(STOP_WORD_PATH);
        if(!in.is_open()){
            LOG(FATAL, "load stop words file error");
            return;
        }
        std::string line;
        while(std::getline(in, line)){
            stop_words.insert({line, true});
        }
        // `in` is closed by its destructor.
    }

    /// Segments `src` with Jieba's CutForSearch and drops every token that is
    /// in the stop-word table.
    ///
    /// @param src  text to segment
    /// @param out  receives the remaining tokens in their original order;
    ///             must not be null
    void CutStringHelper(const std::string &src, std::vector<std::string> *out)
    {
        jieba.CutForSearch(src, *out);

        const auto is_stop_word = [this](const std::string &word) {
            return stop_words.find(word) != stop_words.end();
        };
        // Erase-remove compacts the vector in one pass instead of shifting
        // the tail once per removed token.
        out->erase(std::remove_if(out->begin(), out->end(), is_stop_word), out->end());
    }
public:
    /// Convenience entry point: segments `src` through the shared instance.
    /// See CutStringHelper() for the meaning of the parameters.
    static void CutString(const std::string &src,std::vector<std::string> *out)
    {
        get_instance()->CutStringHelper(src, out);
    }
};
//cppjieba::Jieba JiebaUtil::jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH);//静态成员在类外定义
}
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。