当前位置：首页 > news >正文

Boost搜索引擎：项目整体代码及布局

news 来源：原创 2024/9/20 5:57:56

项目在xshell上面的目录

例如：dict -> /home/dz/...

这里建立的是软链接（符号链接，即文中所说的“动态链接”），目的是方便编译器查找。你也可以把所安装的开发工具直接放到当前目录下；无论哪种方式，最终都要在代码中包含它，所以只需在代码中指明要包含的开发工具头文件的路径即可，并非一定要构建软链接。

动态链接的构建

ln -s /home/dz/Project/Hpp/cppjieba/dict/ ./dict

格式：ln -s 要链接的头文件或目录所在路径 目标路径。其中 ./dict 就是建立出来的软链接（文中所说的“动态链接”）；被链接的源路径最好使用绝对路径。

动态链接的删除

unlink dict

使用 rm 链接名 也可以删除该软链接，但更推荐使用 unlink 链接名。

---------------------------------------------------------------------------------------------------------------------------------

以下代码就是Boost搜索引擎整体代码，代码实现细节Boost搜索引擎项目专栏中都有列举，请先参考代码实现细节再来整体观看项目代码，个人感觉会好一些。

该项目中索引构建模块实现，运用很多容器，需要掌握不同容器的特性，以及容器中存储的结构体信息，只有了解到，容器与容器之间的关系，结构体之间的关系，才能彻底看懂这个项目。

项目工具模块util.hpp

#pragma once

#include <algorithm>
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <mutex>
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include "log.hpp"
#include "cppjieba/Jieba.hpp"

// Shared helpers for the search-engine modules: file reading, string
// splitting and word segmentation with stop-word removal.
namespace ns_util{

    // File helpers.
    class FileUtil{
    public:
        // Reads the text file at `file_path` and appends its content to `*out`.
        //
        // The file is consumed line by line with std::getline, so line
        // terminators are NOT copied into `*out`: consecutive lines are
        // concatenated directly.
        //
        // Returns false (after reporting on stderr) when the file cannot be
        // opened, true otherwise.
        static bool ReadFile(const std::string &file_path, std::string *out)
        {
            std::ifstream in(file_path, std::ios::in);
            if(!in.is_open()){
                std::cerr << "open file " << file_path << " error" << std::endl;
                return false;
            }

            // std::getline returns the stream; its conversion to bool turns
            // false at end of file, which terminates the loop.
            std::string line;
            while(std::getline(in, line)){
                *out += line;
            }
            return true; // the stream is closed by its destructor
        }
    };

    // String helpers.
    class StringUtil{
    public:
        // Splits `target` on any character contained in `sep` and stores the
        // pieces in `*out` (previous content of `*out` is replaced by
        // boost::split). Adjacent separators are merged into one
        // (boost::token_compress_on).
        static void Split(const std::string &target, std::vector<std::string> *out, const std::string &sep)
        {
            boost::split(*out, target, boost::is_any_of(sep), boost::token_compress_on);
        }
    };

    // Dictionary files handed to cppjieba, relative to the working directory.
    const char* const DICT_PATH = "./dict/jieba.dict.utf8";
    const char* const HMM_PATH = "./dict/hmm_model.utf8";
    const char* const USER_DICT_PATH = "./dict/user.dict.utf8";
    const char* const IDF_PATH = "./dict/idf.utf8";
    const char* const STOP_WORD_PATH = "./dict/stop_words.utf8";

    // Process-wide word segmenter built on cppjieba that additionally drops
    // stop words from the segmentation result.
    class JiebaUtil{
    private:
        cppjieba::Jieba jieba;
        // Stop words loaded from STOP_WORD_PATH; only the keys are used.
        std::unordered_map<std::string, bool> stop_words;

    private:
        JiebaUtil():jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH)
        {
            InitJiebaUtil();
        }
        JiebaUtil(const JiebaUtil&) = delete;
        JiebaUtil& operator=(const JiebaUtil&) = delete;

    public:
        // Returns the single shared instance, creating it (and loading the
        // stop-word list) on first use.
        //
        // A function-local static is used instead of a hand-written
        // check/lock/check on a plain pointer: since C++11 the initialization
        // of a local static is guaranteed to run exactly once even when
        // several threads reach it concurrently, and no lock can be left held
        // if construction throws.
        static JiebaUtil* get_instance()
        {
            static JiebaUtil single;
            return &single;
        }

        // Loads the stop-word list, one word per line, from STOP_WORD_PATH.
        // Logs a FATAL message and leaves the list unchanged when the file
        // cannot be opened. Called by the constructor; calling it again only
        // re-inserts the same keys.
        void InitJiebaUtil()
        {
            std::ifstream in(STOP_WORD_PATH);
            if(!in.is_open()){
                LOG(FATAL, "load stop words file error");
                return;
            }

            std::string line;
            while(std::getline(in, line)){
                stop_words.insert({line, true});
            }
        }

        // Segments `src` with cppjieba's CutForSearch into `*out`, then
        // removes every token that is a stop word. The relative order of the
        // remaining tokens is preserved.
        void CutStringHelper(const std::string &src, std::vector<std::string> *out)
        {
            jieba.CutForSearch(src, *out);

            // erase/remove_if drops all stop words in a single pass instead of
            // shifting the vector tail once per removed token.
            out->erase(std::remove_if(out->begin(), out->end(),
                                      [this](const std::string &token){
                                          return stop_words.find(token) != stop_words.end();
                                      }),
                       out->end());
        }

    public:
        // Convenience entry point: segments `src` into `*out` using the shared
        // instance (stop words removed).
        static void CutString(const std::string &src, std::vector<std::string> *out)
        {
            ns_util::JiebaUtil::get_instance()->CutStringHelper(src, out);
        }
    };
}

日志模块log.hpp

#pragma once

#include <iostream>
#include <string>
#include <ctime>

// Log levels. LOG() stringifies the level NAME it is given (via #LEVEL), so
// the numeric values below are not what gets printed.
#define NORMAL  1
#define WARNING 2
#define DEBUG   3
#define FATAL   4

// Logs MESSAGE at LEVEL together with the source file and line of the call
// site, e.g. LOG(WARNING, "disk low").
#define LOG(LEVEL, MESSAGE) log(#LEVEL, MESSAGE, __FILE__, __LINE__)

// Writes one log record to stdout in the form
//   [level][unix-timestamp][message][file : line]
// followed by a newline and a flush.
//
// Declared inline because this header defines the function and may be included
// from several translation units.
//
// level:   textual level name (normally produced by LOG()).
// message: free-form message text.
// file:    source file of the call site.
// line:    source line of the call site.
inline void log(const std::string &level, const std::string &message, const std::string &file, int line)
{
    std::cout << "[" << level << "]"
              << "[" << std::time(nullptr) << "]"
              << "[" << message << "]"
              << "[" << file << " : " << line << "]" << std::endl;
}

数据处理模块parser.cc

#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include <boost/filesystem.hpp>
#include "util.hpp"

// Directory that holds all the html pages to be parsed.
const std::string src_path = "data/input";
// File that receives the parsed documents, one document per line.
const std::string output = "data/raw_html/raw.txt";

// One parsed html document.
typedef struct DocInfo{
    std::string title;   // document title
    std::string content; // document text with the tags stripped
    std::string url;     // url of the document on the official site
}DocInfo_t;

// Parameter convention used below:
//   const & : input
//   *       : output
//   &       : input and output
bool EnumFile(const std::string &src_path, std::vector<std::string> *files_list);
bool ParseHtml(const std::vector<std::string> &files_list, std::vector<DocInfo_t> *results);
bool SaveHtml(const std::vector<DocInfo_t> &results, const std::string &output);

// Runs the three parsing stages. Exit status: 0 on success, 1/2/3 when the
// enumerate/parse/save stage fails.
int main()
{
    // Step 1: recursively collect the path of every html file under src_path
    // so the files can be read one by one afterwards.
    std::vector<std::string> files_list;
    if(!EnumFile(src_path, &files_list)){
        std::cerr << "enum file name error!" << std::endl;
        return 1;
    }

    // Step 2: read and parse every file in files_list.
    std::vector<DocInfo_t> results;
    if(!ParseHtml(files_list, &results)){
        std::cerr << "parse html error" << std::endl;
        return 2;
    }

    // Step 3: write the parsed documents to output, fields separated by '\3'.
    if(!SaveHtml(results, output)){
        std::cerr << "sava html error" << std::endl;
        return 3;
    }

    return 0;
}

// Appends to `*files_list` the path of every regular file with the ".html"
// extension found recursively under `src_path`.
// Returns false (after reporting on stderr) when `src_path` does not exist.
bool EnumFile(const std::string &src_path, std::vector<std::string> *files_list)
{
    namespace fs = boost::filesystem;
    const fs::path root_path(src_path);

    // Nothing to do when the root directory is missing.
    if(!fs::exists(root_path)){
        std::cerr << src_path << " not exists" << std::endl;
        return false;
    }

    // A default-constructed iterator is the end marker of the traversal.
    const fs::recursive_directory_iterator end;
    for(fs::recursive_directory_iterator iter(root_path); iter != end; ++iter){
        // html pages are regular files; skip directories and the like.
        if(!fs::is_regular_file(*iter)){
            continue;
        }
        // Only keep files whose extension is ".html".
        if(iter->path().extension() != ".html"){
            continue;
        }
        files_list->push_back(iter->path().string());
    }

    return true;
}

// Extracts the text between "<title>" and the following "</title>" of `file`
// into `*title`. Returns false when either tag is missing.
static bool ParseTitle(const std::string &file, std::string *title)
{
    const std::string open_tag = "<title>";
    const std::string close_tag = "</title>";

    const std::size_t open_pos = file.find(open_tag);
    if(open_pos == std::string::npos){
        return false;
    }

    // Look for the closing tag only AFTER the opening tag, so an unrelated
    // "</title>" earlier in the file cannot make the extraction fail.
    const std::size_t text_begin = open_pos + open_tag.size();
    const std::size_t text_end = file.find(close_tag, text_begin);
    if(text_end == std::string::npos){
        return false;
    }

    *title = file.substr(text_begin, text_end - text_begin);
    return true;
}

// Strips the tags from `file` with a two-state machine and appends the
// remaining text to `*content`. Always returns true.
static bool ParseContent(const std::string &file, std::string *content)
{
    enum class ParseState { kInTag, kInText };

    // Scanning starts in the "inside a tag" state, so anything before the
    // first '>' is discarded.
    ParseState state = ParseState::kInTag;
    for(char c : file){
        if(state == ParseState::kInTag){
            if(c == '>'){
                state = ParseState::kInText;
            }
            continue;
        }

        if(c == '<'){
            state = ParseState::kInTag;
            continue;
        }

        // '\n' is the record separator of the output file, so it must not
        // appear inside a document.
        content->push_back(c == '\n' ? ' ' : c);
    }
    return true;
}

// Builds the official url of the page stored at `file_path`: the part of the
// path after the global src_path prefix is appended to the documentation root.
// `file_path` is expected to start with src_path (as produced by EnumFile).
static bool ParseUrl(const std::string &file_path, std::string *url)
{
    const std::string url_head = "https://www.boost.org/doc/libs/1_78_0/doc/html";
    const std::string url_tail = file_path.substr(src_path.size());
    *url = url_head + url_tail;
    return true;
}

// Debug helper: prints one parsed document to stdout.
static void ShowDoc( const DocInfo_t &doc)
{
    std::cout << "title: " << doc.title << std::endl;
    std::cout << "content: " << doc.content << std::endl;
    std::cout << "url: " << doc.url << std::endl;
}

// Reads and parses every file of `files_list`, appending one DocInfo_t per
// successfully parsed file to `*results`. Files that cannot be read or parsed
// are skipped silently. Always returns true.
bool ParseHtml(const std::vector<std::string> &files_list, std::vector<DocInfo_t> *results)
{
    for(const std::string &file : files_list){
        // 1. Read the whole file.
        std::string result;
        if(!ns_util::FileUtil::ReadFile(file, &result)){
            continue;
        }

        DocInfo_t doc;
        // 2. Extract the title.
        if(!ParseTitle(result, &doc.title)){
            continue;
        }
        // 3. Extract the content, i.e. strip the tags.
        if(!ParseContent(result, &doc.content)){
            continue;
        }
        // 4. Build the url from the file path.
        if(!ParseUrl(file, &doc.url)){
            continue;
        }

        // Parsing succeeded; move the document to avoid copying its strings.
        results->push_back(std::move(doc));
    }
    return true;
}

// Writes `results` to the file `output`, one document per line in the form
//   title '\3' content '\3' url '\n'
// Returns false (after reporting on stderr) when the file cannot be opened or
// the data cannot be written.
bool SaveHtml(const std::vector<DocInfo_t> &results, const std::string &output)
{
    const char field_sep = '\3';

    // Written in binary mode so the bytes reach the file untouched.
    std::ofstream out(output, std::ios::out | std::ios::binary);
    if(!out.is_open()){
        std::cerr << "open " << output << " failed!" << std::endl;
        return false;
    }

    for(const auto &item : results){
        std::string record = item.title;
        record += field_sep;
        record += item.content;
        record += field_sep;
        record += item.url;
        record += '\n';

        out.write(record.c_str(), record.size());
    }

    // Flush explicitly so that a failed write (e.g. disk full) is reported
    // instead of being lost when the stream is destroyed.
    out.flush();
    if(!out){
        std::cerr << "write " << output << " failed!" << std::endl;
        return false;
    }
    return true;
}

索引构建模块index.hpp

#pragma once

#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <mutex>
#include "util.hpp"
#include "log.hpp"

// Forward and inverted index built from the parsed documents.
namespace ns_index{

    // One document of the forward index.
    struct DocInfo{
        std::string title;   // document title
        std::string content; // document text with the tags stripped
        std::string url;     // url of the document on the official site
        uint64_t doc_id;     // document id; equals its position in the forward index
    };

    // One entry of an inverted list: `word` occurs in document `doc_id`
    // with relevance `weight`.
    struct InvertedElem{
        uint64_t doc_id;
        std::string word;
        int weight;
        InvertedElem():doc_id(0), weight(0){}
    };

    // Inverted list: all the documents that contain a given word.
    typedef std::vector<InvertedElem> InvertedList;

    class Index{
    private:
        // Forward index: the vector position of a document is its id.
        std::vector<DocInfo> forward_index;
        // Inverted index: maps a (lower-cased) word to its inverted list.
        std::unordered_map<std::string, InvertedList> inverted_index;

    private:
        Index(){} // private: instances are only created by GetInstance()
        Index(const Index&) = delete;
        Index& operator=(const Index&) = delete;

    public:
        ~Index(){}

    public:
        // Returns the single shared index, creating it on first use.
        //
        // A function-local static is used: since C++11 its initialization is
        // guaranteed to run exactly once even when several threads call this
        // function concurrently, so no explicit locking is needed.
        static Index* GetInstance()
        {
            static Index single;
            return &single;
        }

        // Returns the document with id `doc_id`, or nullptr (after reporting
        // on stderr) when the id is out of range.
        // The pointer refers to an element of the forward index and is
        // invalidated when the index grows.
        DocInfo *GetForwardIndex(uint64_t doc_id)
        {
            if(doc_id >= forward_index.size()){
                std::cerr << "doc_id out range, error!" << std::endl;
                return nullptr;
            }
            return &forward_index[doc_id];
        }

        // Returns the inverted list of `word`, or nullptr (after reporting on
        // stderr) when the word is not indexed. Words are stored lower-cased,
        // so callers must lower-case `word` first.
        InvertedList *GetInvertedList(const std::string &word)
        {
            auto iter = inverted_index.find(word);
            if(iter == inverted_index.end()){
                std::cerr << word << " have no InvertedList" << std::endl;
                return nullptr;
            }
            return &(iter->second);
        }

        // Builds the forward and inverted index from the file `input`
        // (one "title\3content\3url" record per line).
        // Lines that cannot be parsed are reported on stderr and skipped.
        // Returns false when the file cannot be opened, true otherwise.
        bool BuildIndex(const std::string &input)
        {
            std::ifstream in(input, std::ios::in | std::ios::binary);
            if(!in.is_open()){
                std::cerr << "sorry, " << input << " open error" << std::endl;
                return false;
            }

            std::string line;
            int count = 0;
            while(std::getline(in, line)){
                DocInfo *doc = BuildForwardIndex(line);
                if(nullptr == doc){
                    std::cerr << "build " << line << " error" << std::endl; // for debug
                    continue;
                }

                BuildInvertedIndex(*doc);
                count++;
                LOG(NORMAL, "当前的已经建立的索引文档: " + std::to_string(count));
            }
            return true;
        }

    private:
        // Parses one record line and appends the document to the forward
        // index. Returns a pointer to the stored document, or nullptr when the
        // line does not split into exactly three fields.
        // The returned pointer is only valid until the next insertion.
        DocInfo *BuildForwardIndex(const std::string &line)
        {
            // 1. Split the line into title, content and url.
            std::vector<std::string> results;
            const std::string sep = "\3"; // field separator inside a line
            ns_util::StringUtil::Split(line, &results, sep);
            if(results.size() != 3){
                return nullptr;
            }

            // 2. Fill the document.
            DocInfo doc;
            doc.title = results[0];
            doc.content = results[1];
            doc.url = results[2];
            // The id is taken BEFORE the insertion, so it equals the position
            // the document is about to occupy in the vector.
            doc.doc_id = forward_index.size();

            // 3. Store it in the forward index.
            forward_index.push_back(std::move(doc));
            return &forward_index.back();
        }

        // Adds every word of `doc` to the inverted index. Title and content
        // are segmented, lower-cased and counted; the weight of a word is
        //   kTitleWeight * occurrences in title + kContentWeight * occurrences in content.
        // Always returns true.
        bool BuildInvertedIndex(const DocInfo &doc)
        {
            // Relevance factors (local constants rather than macros so they do
            // not leak into every file that includes this header).
            const int kTitleWeight = 10;
            const int kContentWeight = 1;

            struct word_cnt{
                int title_cnt;
                int content_cnt;
                word_cnt():title_cnt(0), content_cnt(0){}
            };
            // Per-document word frequencies, keyed by lower-cased word.
            std::unordered_map<std::string, word_cnt> word_map;

            // Count the words of the title.
            std::vector<std::string> title_words;
            ns_util::JiebaUtil::CutString(doc.title, &title_words);
            for(std::string &word : title_words){
                boost::to_lower(word); // the index ignores case
                word_map[word].title_cnt++; // operator[] creates the entry when missing
            }

            // Count the words of the content.
            std::vector<std::string> content_words;
            ns_util::JiebaUtil::CutString(doc.content, &content_words);
            for(std::string &word : content_words){
                boost::to_lower(word);
                word_map[word].content_cnt++;
            }

            // One inverted element per distinct word of the document.
            for(const auto &word_pair : word_map){
                InvertedElem item;
                item.doc_id = doc.doc_id;
                item.word = word_pair.first;
                item.weight = kTitleWeight * word_pair.second.title_cnt
                            + kContentWeight * word_pair.second.content_cnt;

                InvertedList &inverted_list = inverted_index[word_pair.first];
                inverted_list.push_back(std::move(item));
            }

            return true;
        }
    };
}

关键词搜索模块searcher.hpp

#pragma once

#include "index.hpp"
#include "util.hpp"
#include "log.hpp"
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <iterator>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <jsoncpp/json/json.h>

// Keyword search on top of ns_index::Index.
namespace ns_searcher{

    // Search hit for one document: the accumulated weight of all query words
    // that matched it, and those words.
    struct InvertedElemPrint{
        uint64_t doc_id;
        int weight;
        std::vector<std::string> words;
        InvertedElemPrint():doc_id(0), weight(0){}
    };

    class Searcher{
    private:
        ns_index::Index *index; // index used for the lookups; set by InitSearcher

    public:
        Searcher():index(nullptr){}
        ~Searcher(){}

    public:
        // Obtains the shared index and builds it from the file `input`.
        // Must be called before Search().
        void InitSearcher(const std::string &input)
        {
            // 1. Get (or create) the index singleton.
            index = ns_index::Index::GetInstance();
            LOG(NORMAL, "获取index单例成功...");

            // 2. Build the forward and inverted index.
            index->BuildIndex(input);
            LOG(NORMAL, "建立正排和倒排索引成功...");
        }

        // Searches the index for `query` and stores the result in
        // `*json_string` as a JSON array serialized with Json::FastWriter.
        // Each element has the members "title", "desc", "url", "id" and
        // "weight"; elements are ordered by descending weight.
        // When nothing matches (or InitSearcher was never called) the
        // serialized value is JSON null.
        void Search(const std::string &query, std::string *json_string)
        {
            Json::FastWriter writer;
            if(nullptr == index){
                LOG(WARNING, "Search called before InitSearcher");
                *json_string = writer.write(Json::Value());
                return;
            }

            // 1. Segment the query the same way the documents were segmented.
            std::vector<std::string> words;
            ns_util::JiebaUtil::CutString(query, &words);

            // 2. Look every word up and merge the hits per document, so a
            //    document matched by several words appears only once with the
            //    sum of the weights.
            std::unordered_map<uint64_t, InvertedElemPrint> tokens_map;
            for(std::string &word : words){
                boost::to_lower(word); // the index stores lower-cased words
                ns_index::InvertedList *inverted_list = index->GetInvertedList(word);
                if(nullptr == inverted_list){
                    continue;
                }

                for(const auto &elem : *inverted_list){
                    auto &item = tokens_map[elem.doc_id]; // created when missing
                    item.doc_id = elem.doc_id;
                    item.weight += elem.weight;
                    item.words.push_back(elem.word);
                }
            }

            std::vector<InvertedElemPrint> inverted_list_all;
            inverted_list_all.reserve(tokens_map.size());
            // Iterate by non-const reference: std::move on a const element
            // would silently copy the word vectors.
            for(auto &item : tokens_map){
                inverted_list_all.push_back(std::move(item.second));
            }

            // 3. Order the hits by descending relevance.
            std::sort(inverted_list_all.begin(), inverted_list_all.end(),
                      [](const InvertedElemPrint &e1, const InvertedElemPrint &e2){
                          return e1.weight > e2.weight;
                      });

            // 4. Serialize the hits with jsoncpp.
            Json::Value root;
            for(const auto &item : inverted_list_all){
                ns_index::DocInfo *doc = index->GetForwardIndex(item.doc_id);
                if(nullptr == doc){
                    continue;
                }

                Json::Value elem;
                elem["title"] = doc->title;
                // Only an excerpt around the first matched word is returned,
                // not the whole content. `words` is never empty here: a hit is
                // only created together with its first word.
                elem["desc"] = GetDesc(doc->content, item.words[0]);
                elem["url"] = doc->url;
                // Debug information.
                elem["id"] = static_cast<int>(item.doc_id);
                elem["weight"] = item.weight;

                root.append(elem);
            }

            *json_string = writer.write(root);
        }

        // Returns an excerpt of `html_content` around the first
        // case-insensitive occurrence of `word`: up to 50 bytes before the
        // match and up to 100 bytes from its start, followed by "...".
        // Returns "None1" when `word` does not occur in `html_content`.
        // The excerpt is cut on byte positions, so a multi-byte character at
        // either end may be split.
        std::string GetDesc(const std::string &html_content, const std::string &word)
        {
            const std::size_t prev_step = 50;
            const std::size_t next_step = 100;

            // 1. Find the first occurrence, ignoring case. The bytes go
            //    through unsigned char because std::tolower is undefined for
            //    negative values, which is what plain char yields for UTF-8
            //    bytes on platforms where char is signed.
            auto iter = std::search(html_content.begin(), html_content.end(),
                                    word.begin(), word.end(),
                                    [](char lhs, char rhs){
                                        return std::tolower(static_cast<unsigned char>(lhs))
                                            == std::tolower(static_cast<unsigned char>(rhs));
                                    });
            if(iter == html_content.end()){
                return "None1";
            }
            const std::size_t pos = static_cast<std::size_t>(std::distance(html_content.begin(), iter));

            // 2. Compute the window [start, end). `end` is exclusive, so the
            //    last byte of the content is kept when the window reaches it.
            const std::size_t start = (pos > prev_step) ? pos - prev_step : 0;
            std::size_t end = html_content.size();
            if(end - pos > next_step){ // pos < size() because a match was found
                end = pos + next_step;
            }

            // 3. Cut the excerpt.
            std::string desc = html_content.substr(start, end - start);
            desc += "...";
            return desc;
        }
    };
}

http网络模块http_server.cc

// NOTE(review): this include only works if "cpp-httplib" resolves to the
// httplib header itself (e.g. a symlink to httplib.h) — confirm; with a plain
// checkout of the library directory it would be "cpp-httplib/httplib.h".
#include "cpp-httplib"
#include "searcher.hpp"

// Parsed documents produced by the parser (one document per line).
const std::string input = "data/raw_html/raw.txt";
// Directory served as static content.
const std::string root_path = "./wwwroot";

// Builds the index, then serves static files from root_path and search
// requests on GET /s?word=<query> on port 8081.
// Returns 1 when the server cannot listen, 0 once it stops normally.
int main()
{
    ns_searcher::Searcher search;
    search.InitSearcher(input);

    httplib::Server svr;
    svr.set_base_dir(root_path.c_str());

    // `search` outlives the server (listen() blocks until the server stops),
    // so capturing it by reference is safe.
    svr.Get("/s", [&search](const httplib::Request &req, httplib::Response &rsp){
        if(!req.has_param("word")){
            rsp.set_content("必须要有搜索关键字!", "text/plain; charset=utf-8");
            return;
        }

        const std::string word = req.get_param_value("word");
        LOG(NORMAL, "用户搜索的: " + word);

        std::string json_string;
        search.Search(word, &json_string);
        rsp.set_content(json_string, "application/json");
    });

    LOG(NORMAL, "服务器启动成功...");
    // listen() blocks while serving; a false result (e.g. the port is already
    // in use) is reported instead of exiting silently with status 0.
    if(!svr.listen("0.0.0.0", 8081)){
        LOG(FATAL, "listen on 0.0.0.0:8081 failed");
        return 1;
    }
    return 0;
}

makefile模块编写

# Binaries produced by this makefile.
PARSER=parser
DUG=debug
HTTP_SERVER=http_server
cc=g++

# Default target: build everything.
.PHONY:all
all:$(PARSER) $(DUG) $(HTTP_SERVER)

# Recipe lines must start with a TAB character.
$(PARSER):parser.cc
	$(cc) -o $@ $^ -lboost_system -lboost_filesystem -std=c++11
$(DUG):debug.cc
	$(cc) -o $@ $^ -ljsoncpp -std=c++11
$(HTTP_SERVER):http_server.cc
	$(cc) -o $@ $^ -ljsoncpp -lpthread -std=c++11

# Remove the built binaries.
.PHONY:clean
clean:
	rm -f $(PARSER) $(DUG) $(HTTP_SERVER)