// [Hosting-site UI text, not part of the program] Code pull complete; the page will refresh automatically.
#include<iostream>
#include<vector>
#include<string>
#include <boost/filesystem.hpp>
#include "tool.hpp"
#include "log.hpp"
using namespace std;
// Root directory handed to Readfile() by main(); it is scanned for .html files.
const string src_path("data/input");
// Output file handed to SaveHtml() by main(); receives the parsed records.
const string raw("data/raw_html/raw.txt");
// One parsed HTML document: the three fields that are later written out
// as a single record.
struct format
{
    string title;   // text found between <title> and </title>
    string content; // document text with the markup removed
    string url;     // URL built for this document
};
// Alias kept so both the `format` and `Format` spellings remain usable.
using Format = format;
bool Readfile(const string &src_path,vector<string> *files_gather)
{
boost::filesystem::path file_path(src_path);
if(!boost::filesystem::exists(file_path))//判断stc_path路径是否不存在
{
cerr<<"src_path is does not exist"<<endl;
return false;
}
boost::filesystem::recursive_directory_iterator end; //空迭代器,标志结束
//boost::filesystem::directory_iterator 用于迭代指定目录的直接内容,不会递归遍历子目录
//boost::filesystem::recursive_directory_iterator 用于递归遍历目录及其子目录的内容
for(boost::filesystem::recursive_directory_iterator iter(file_path);iter!=end;iter++)//遍历
{
if(!boost::filesystem::is_regular_file(*iter))//我们需要后缀.html并且是普通文件
{
continue;
}
if(iter->path().extension()!=".html")
{
continue;
}
//cout<<*iter<<endl;
//cout<<iter->path().string()<<endl;//debug
// count++;
files_gather->push_back(iter->path().string());
}
//cout<<count<<endl;
return true;
}
/**
 * Extracts the text between the first "<title>" and the first "</title>"
 * that follows it.
 *
 * @param result  Full HTML text of a document.
 * @param title   Receives the title text on success; untouched on failure.
 * @return true when both tags were found, false otherwise.
 */
static bool partitle(const string &result,string *title)
{
    const string open_tag("<title>");
    const string close_tag("</title>");

    const size_t open_pos = result.find(open_tag);
    if(open_pos == string::npos)
    {
        return false;
    }
    const size_t begin = open_pos + open_tag.size();

    // Search for the closing tag only after the opening tag, so a stray
    // "</title>" earlier in the text cannot make a valid title be rejected.
    const size_t end = result.find(close_tag, begin);
    if(end == string::npos)
    {
        return false;
    }

    *title = result.substr(begin, end - begin);
    return true;
}
/**
 * Removes markup from an HTML text and appends what remains to *content.
 *
 * The scan starts in "inside a tag" mode, so anything before the first '>'
 * is dropped. Outside a tag every character is appended, with '\n' turned
 * into a space; a '<' switches back to tag mode.
 *
 * @param result   HTML text to scan.
 * @param content  Output string; characters are appended, never cleared.
 * @return always true.
 */
static bool parcontent(const string &result,string *content)
{
    bool in_text = false; // false: currently inside a tag
    for(size_t i = 0; i < result.size(); ++i)
    {
        const char ch = result[i];
        if(!in_text)
        {
            // Only the closing '>' of a tag switches to text mode.
            in_text = (ch == '>');
            continue;
        }
        if(ch == '<')
        {
            in_text = false;
            continue;
        }
        content->push_back(ch == '\n' ? ' ' : ch);
    }
    return true;
}
// Prints the title, content and url of doc to standard output, one labelled
// line per field.
static void ShowDoc(const Format &doc)
{
    cout << "title: " << doc.title << endl
         << "content: " << doc.content << endl
         << "url: " << doc.url << endl;
}
/**
 * Builds the URL of a document from its local path by replacing the
 * src_path prefix with the online documentation root.
 *
 * @param file  Local path of the HTML file; must start with src_path.
 * @param url   Receives the resulting URL on success; untouched on failure.
 * @return false when file does not start with src_path, true otherwise.
 */
static bool parturl(const string &file,string *url)
{
    const string url_head = "https://www.boost.org/doc/libs/1_78_0/doc/html";

    // substr(src_path.size()) throws std::out_of_range when file is shorter
    // than src_path, and yields a meaningless tail when file is not located
    // under src_path; reject both cases up front.
    if(file.compare(0, src_path.size(), src_path) != 0)
    {
        return false;
    }

    *url = url_head + file.substr(src_path.size());
    return true;
}
bool Anafile(vector<string> &files_gather,vector<Format> *outcome)
{
//int k =2;
for(string &file : files_gather)
{
string result;//读取文件内容
if(!project_tool::Filetool::divestfile(file,&result))
{
continue;
}
Format temp;
if(!partitle(result,&temp.title))//读取文档标题
{
continue;
}
if(!parcontent(result,&temp.content))//去标签
{
continue;
}
if(!parturl(file,&temp.url))
{
continue;
}
outcome->push_back(move(temp));//性能提升
}
return true;
}
bool SaveHtml(vector<Format> &outcome,const string &raw)
{
const char c = '\3';
ofstream out(raw, ios::out | ios::binary);
if(!out.is_open()){
cerr << "open " << raw << " failed!" << endl;
return false;
}
for(Format &item : outcome){
string temp_out;
temp_out = item.title;
temp_out += c;
temp_out += item.content;
temp_out += c;
temp_out += item.url;
temp_out += '\n';
out.write(temp_out.c_str(), temp_out.size());
if (out.fail()) {
std::cerr << "Error occurred while writing to the file." << std::endl;
return 1;
}
}
out.close();
return true;
}
// Runs the three steps in order: collect the HTML paths, parse the files,
// save the records. The exit code is 0 on success, or 1, 2 or 3 for a
// failure in the first, second or third step.
int main()
{
    // Step 1: gather the paths of the html files for later analysis.
    vector<string> html_paths;
    if(!Readfile(src_path,&html_paths))
    {
        cerr<<"Readfile is error"<<endl;
        return 1;
    }

    // Step 2: analyse the collected files into records.
    vector<Format> docs;
    if(!Anafile(html_paths,&docs))
    {
        cerr<<"Anafile is error"<<endl;
        return 2;
    }

    // Step 3: store the records in raw, fields separated by \3.
    if(!SaveHtml(docs,raw))
    {
        cerr<<"SaveHtml is error"<<endl;
        return 3;
    }

    return 0;
}
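// [Hosting-site moderation notice, not part of the program] This section may contain content unsuitable for display, so the page does not show it. You can review and modify it through the relevant editing functions.
// If you confirm that the content involves no inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false or worthless content, and nothing that violates applicable national laws and regulations, you can click submit to appeal and it will be handled as soon as possible.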