A Simple Site Search Engine Based on Baidu's brpc Framework

Updated: 2024-10-08 02:17:42



I previously built a website with a simple site-search feature, but it was far too slow: every query pulled data straight from the database. So I spent some time improving it, implementing the search with an inverted index and a forward index.

Note: every library used in this project must be installed yourself (cppjieba, jsoncpp, brpc, gflags, protobuf, leveldb); the last three are required by brpc.

Approach:

1. First, implement an offline program that builds the indexes

  • First pull the index source from the database; every index entry is derived from this data. Since my project is a news system, I extract each news item's id, title, author, content and date. All keywords come from title, author and content.
  • Use the cppjieba segmentation library to split the keyword sources into words. Every segmented word maps back to the id of the news item it came from, so a hash table stores the result (key: segmented keyword, value: the ids of the news items in which that keyword appears).
  • Since the same keyword can appear in different news items, the value of that hash table is an unordered_set: the set of news ids the keyword occurs in.
  • The hash table built this way is the inverted index (keyword → news ids).
  • The forward index is simple: it is also a hash table (key: news id, value: the news item's info, i.e. title, author, content and date). Because its value is itself a set of key-value pairs (title: xxx), the table's type is: unordered_map<string, unordered_map<string, string>>
  • Finally, use jsoncpp to write both indexes out to files in JSON format (.json). Let's look at how the file contents are laid out.

Forward index file

Inverted index file
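The screenshots of the two files are not reproduced here. Given the serialization code below (ids joined into a space-separated string; one JSON object per news item), their contents look roughly like this (all ids and field values are made-up examples):

```json
{
   "brpc" : "3 7 12 ",
   "engine" : "7 "
}
```

for inverted_index.json (keyword → space-separated news ids), and

```json
{
   "7" : {
      "author" : "...",
      "content" : "...",
      "date" : "2019-01-01",
      "title" : "..."
   }
}
```

for forward_index.json (news id → all fields of that news item).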

Below is the index-building code.

word_segmentation.hpp (word-segmentation header)

#pragma once
#include "/home/pigff/third-part-lib/cppjieba/include/cppjieba/Jieba.hpp"
#include <iostream>
#include <string>
#include <vector>

using std::cout;
using std::endl;
using std::string;
using std::vector;

// Dictionary used by max-probability segmentation (MPSegment: Max Probability)
const char* const DICT_PATH = "/home/pigff/third-part-lib/cppjieba/dict/jieba.dict.utf8";
// Dictionary used by HMM segmentation (HMMSegment: Hidden Markov Model)
const char* const HMM_PATH = "/home/pigff/third-part-lib/cppjieba/dict/hmm_model.utf8";
// User-defined dictionary
const char* const USER_DICT_PATH = "/home/pigff/third-part-lib/cppjieba/dict/user.dict.utf8";
// IDF file
const char* const IDF_PATH = "/home/pigff/third-part-lib/cppjieba/dict/idf.utf8";
// Stop-word file
const char* const STOP_WORD_PATH = "/home/pigff/third-part-lib/cppjieba/dict/stop_words.utf8";

// Word segmentation backed by the cppjieba library
class WordSegmentation
{
public:
    WordSegmentation()
        : _jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH) {}

    // Return the segmentation result of str
    vector<string> operator()(const string& str)
    {
        vector<string> words;
        _jieba.CutAll(str, words);  // full-mode segmentation (FullSegment)
        return words;
    }

private:
    cppjieba::Jieba _jieba;
};

build_index (index-building program)

#include "word_segmentation.hpp"
#include <mysql/mysql.h>
#include <unordered_map>
#include <unordered_set>
#include <unistd.h>
#include <fcntl.h>
#include <jsoncpp/json/value.h>
#include <jsoncpp/json/json.h>
#include <fstream>

struct All_News_Info
{
    vector<string> ids;
    vector<string> titles;
    vector<string> authors;
    vector<string> contents;
    vector<string> dates;
};

// Fetch the data from the database.
// The data sources are the news title, author and content.
void SearchData(All_News_Info& infos)
{
    MYSQL* conn = mysql_init(NULL);
    if (conn == NULL)
    {
        cout << "Error: mysql_init failed" << endl;
        return;
    }
    // Use the utf8 character set for the connection,
    // otherwise Chinese text cannot be displayed
    mysql_set_character_set(conn, "utf8");
    if (mysql_real_connect(conn, "localhost", "root", "1", "news", 0, NULL, 0) == NULL)
    {
        cout << "Error " << mysql_errno(conn) << ": " << mysql_error(conn);
        return;
    }
    mysql_query(conn, "select id,title,author,content,createdate from news");
    MYSQL_RES* result = mysql_store_result(conn);
    MYSQL_ROW row;
    while ((row = mysql_fetch_row(result)))
    {
        infos.ids.push_back(row[0]);
        infos.titles.push_back(row[1]);
        infos.authors.push_back(row[2]);
        infos.contents.push_back(row[3]);
        infos.dates.push_back(row[4]);
    }
    mysql_free_result(result);
    mysql_close(conn);
}

// Build and save the two indexes.
// Forward index: document id -> full document info
// (used to read a document's fields back).
// Inverted index: keyword -> document ids
// (used to find which documents contain a keyword).
// Search results are ranked by how often a keyword hits a document:
// more hits means higher relevance.
void Save_index(const All_News_Info& infos)
{
    WordSegmentation wordSeg;
    vector<string> results;
    std::unordered_map<string, std::unordered_set<string>> inverted_index;
    std::unordered_map<string, std::unordered_map<string, string>> forward_index;
    for (size_t i = 0; i < infos.titles.size(); ++i)
    {
        // Feed the segmentation results of title, author and content
        // into the inverted index
        results = wordSeg(infos.titles[i]);
        for (auto& it : results)
            inverted_index[it].insert(infos.ids[i]);
        results = wordSeg(infos.contents[i]);
        for (auto& it : results)
            inverted_index[it].insert(infos.ids[i]);
        results = wordSeg(infos.authors[i]);
        for (auto& it : results)
            inverted_index[it].insert(infos.ids[i]);
        // Put all the info for this id into the forward index
        forward_index[infos.ids[i]]["title"] = infos.titles[i];
        forward_index[infos.ids[i]]["author"] = infos.authors[i];
        forward_index[infos.ids[i]]["content"] = infos.contents[i];
        forward_index[infos.ids[i]]["date"] = infos.dates[i];
    }
    // Serialize both indexes to JSON files
    Json::Value root1, root2;
    for (auto& it : inverted_index)
    {
        string str = "";
        for (auto& it2 : it.second)
            str += it2 + " ";
        root1[it.first] = str;
    }
    for (auto& it : forward_index)
    {
        Json::Value partner;
        for (auto& it2 : it.second)
            partner[it2.first] = it2.second;
        root2[it.first] = partner;
    }
    Json::StyledWriter sw;
    std::ofstream os1, os2;
    os1.open("inverted_index.json");
    os2.open("forward_index.json");
    os1 << sw.write(root1);
    os2 << sw.write(root2);
    os1.close();
    os2.close();
}

int main()
{
    All_News_Info infos;
    SearchData(infos);
    Save_index(infos);
    return 0;
}

2. Wrap the index files in an access interface (for the search server built later)

  • Load: load the index files into memory.
  • Find: given a keyword argument, segment it, then look every segment up in the inverted index to see which documents it appears in (a document may be hit more than once, so record a hit count per document).
  • Sort: sort all the matched ids by hit count to rank results by relevance (more hits means higher relevance, so those come first). This is essentially sorting a map by value.
  • ReturnInfo: given the sorted ids, fetch their contents through the forward index and return them.

Below is the interface code.

search_engine.h

#pragma once
#include <iostream>
#include <fstream>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <boost/algorithm/string.hpp>
#include <jsoncpp/json/json.h>
#include "word_segmentation.hpp"

using std::string;

struct News_Info
{
    string title;
    string author;
    string date;
    News_Info(string title_, string author_, string date_)
        : title(title_), author(author_), date(date_) {}
};

// Load the index files into memory
void Load(std::unordered_map<string, std::unordered_set<string>>& inverted_index,
          std::unordered_map<string, std::unordered_map<string, string>>& forward_index);

// Segment keyword and count its hits per document id
bool Find(std::unordered_map<string, std::unordered_set<string>> inverted_index,
          string keyword,
          std::unordered_map<string, int>& ids);

// Sort the matched ids by hit count (relevance)
vector<std::pair<string, int>> Sort(std::unordered_map<string, int> ids);

// Fetch the info for the sorted ids from the forward index
vector<News_Info> ReturnInfo(const vector<std::pair<string, int>>& sort_ids,
                             std::unordered_map<string, std::unordered_map<string, string>> forward_index);

search_engine

#include "search_engine.h"
#include <algorithm>

// Load the index files into hash tables
void Load(std::unordered_map<string, std::unordered_set<string>>& inverted_index,
          std::unordered_map<string, std::unordered_map<string, string>>& forward_index)
{
    Json::Reader reader;
    Json::Value value;
    std::ifstream is1, is2;
    is1.open("/home/pigff/project/search_engine/inverted_index.json");
    is2.open("/home/pigff/project/search_engine/forward_index.json");
    reader.parse(is1, value);
    vector<string> names = value.getMemberNames();
    for (auto& it : names)
    {
        vector<string> v;
        string tmp = value[it].asString();
        boost::algorithm::split(v, tmp, boost::algorithm::is_space());
        for (auto& it2 : v)
            inverted_index[it].insert(it2);
    }
    value.clear();
    reader.parse(is2, value);
    names = value.getMemberNames();
    for (auto& it : names)
    {
        vector<string> names_names = value[it].getMemberNames();
        for (auto& it2 : names_names)
            forward_index[it].insert(make_pair(it2, value[it][it2].asString()));
    }
}

// Segment the search keyword and count, per document id,
// how many of the pieces hit that document
bool Find(std::unordered_map<string, std::unordered_set<string>> inverted_index,
          string keyword,
          std::unordered_map<string, int>& ids)
{
    WordSegmentation wordSeg;
    vector<string> results = wordSeg(keyword);
    for (auto& it : results)
    {
        if (inverted_index[it].empty())
            continue;
        for (auto& it2 : inverted_index[it])
        {
            // Splitting the file contents may leave an empty token; skip it
            if (!it2.empty())
                ids[it2]++;
        }
    }
    return !ids.empty();
}

// Sort the matched news by relevance.
// This is essentially sorting a map by value; since sort only works on
// sequence containers, the data is first copied into a vector and then
// sorted with a comparison functor.
class Compare
{
public:
    bool operator()(const std::pair<string, int>& x, const std::pair<string, int>& y)
    {
        return x.second > y.second;  // more hits = more relevant, so first
    }
};

vector<std::pair<string, int>> Sort(std::unordered_map<string, int> ids)
{
    vector<std::pair<string, int>> ret(ids.begin(), ids.end());
    sort(ret.begin(), ret.end(), Compare());
    return ret;
}

// Fetch the corresponding info structs for the sorted ids
vector<News_Info> ReturnInfo(const vector<std::pair<string, int>>& sort_ids,
                             std::unordered_map<string, std::unordered_map<string, string>> forward_index)
{
    vector<News_Info> ret;
    for (auto& i : sort_ids)
    {
        News_Info news_info(forward_index[i.first]["title"],
                            forward_index[i.first]["author"],
                            forward_index[i.first]["date"]);
        ret.push_back(news_info);
    }
    return ret;
}

3. Build the search-engine server and the client-side search interface

  • We use Baidu's open-source brpc framework to build the server (also because I wanted to try someone else's open-source framework =_=).
  • With brpc, we first need to write a .proto file (brpc is built on protobuf; see the brpc website for a tutorial, and I may cover it in later posts).
  • Then implement a brpc search server. I won't go into detail; the official docs have plenty of examples, including a Chinese version. This server runs permanently in the background.
  • Finally, wrap a client interface for our search CGI: once the HTTP server receives a keyword, it talks to the search server through this client interface.

Below is the code.

SG.proto

syntax = "proto2";

// Tell protoc to generate the C++ Service base class
option cc_generic_services = true;

package SG;               // package name

message Request {
    optional string keyword = 1;
};

message Response {
    repeated Info info = 1;
};

message Info {
    optional string title  = 1;
    optional string author = 2;
    optional string date   = 3;
};

service Service {
    rpc Search(Request) returns (Response);
};

server

#include <gflags/gflags.h>
#include <brpc/server.h>
#include <butil/logging.h>
#include <brpc/stream.h>
#include <unistd.h>
#include "/home/pigff/project/search_engine/search_engine.h"
#include "SG.pb.h"

DEFINE_bool(attachment, true, "Echo attachment as well");
DEFINE_int32(port, 9999, "TCP Port of this server");
DEFINE_int32(idle_timeout_s, -1, "Connection will be closed if there is no "
             "read/write operations during the last `idle_timeout_s'");
DEFINE_int32(logoff_ms, 2000, "Maximum duration of server's LOGOFF state "
             "(waiting for client to close connection before server stops)");

namespace Search{
// Implement the Service base class generated from the proto file
class SearchService : public SG::Service
{
public:
    SearchService()
    {
        // Load the index files once, in the constructor
        Load(inverted_index, forward_index);
    }

    void Search(google::protobuf::RpcController* cntl_base,
                const SG::Request* req,
                SG::Response* resp,
                google::protobuf::Closure* done)
    {
        // This object makes sure done->Run() is called on return
        brpc::ClosureGuard done_guard(done);
        brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_base);
        // Log the request to see how clients interact with the server
        LOG(INFO) << "Received request[log_id=" << cntl->log_id()
                  << "] from " << cntl->remote_side()
                  << " to " << cntl->local_side()
                  << ": " << req->keyword()
                  << " (attached=" << cntl->request_attachment() << ")";
        // Look the segmented keyword up in the inverted index
        std::unordered_map<string, int> ids;
        // If nothing matches, leave the response fields unset (empty)
        if (Find(inverted_index, req->keyword(), ids) == false)
            return;
        // Rank the matches by relevance
        vector<std::pair<string, int>> sort_ids = Sort(ids);
        // Fetch the info for the sorted ids
        vector<News_Info> infos = ReturnInfo(sort_ids, forward_index);
        for (size_t i = 0; i < infos.size(); ++i)
            resp->add_info();
        // Fill in the response
        for (int i = 0; i < resp->info_size(); ++i)
        {
            SG::Info* info = resp->mutable_info(i);
            info->set_title(infos[i].title);
            info->set_author(infos[i].author);
            info->set_date(infos[i].date);
        }
        if (FLAGS_attachment)
        {
            // The attachment is wired to the network directly
            // instead of being serialized into the protobuf message
            cntl->response_attachment().append(cntl->request_attachment());
        }
    }

private:
    std::unordered_map<string, std::unordered_set<string>> inverted_index;
    std::unordered_map<string, std::unordered_map<string, string>> forward_index;
};
} // end namespace Search

int main(int argc, char* argv[])
{
    // Run in the background as a daemon
    daemon(1, 1);
    // Parse gflags
    gflags::ParseCommandLineFlags(&argc, &argv, true);
    brpc::Server server;
    // Instance of the proto service
    Search::SearchService search_service;
    // Add the service to the server.
    // SERVER_DOESNT_OWN_SERVICE: the service lives on the stack, so the
    // server must not delete it (use brpc::SERVER_OWNS_SERVICE otherwise).
    if (server.AddService(&search_service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0)
    {
        LOG(ERROR) << "Fail to add SearchService";
        return -1;
    }
    // Start the server
    brpc::ServerOptions option;
    option.idle_timeout_sec = FLAGS_idle_timeout_s;
    if (server.Start(FLAGS_port, &option) != 0)
    {
        LOG(ERROR) << "Fail to start SearchServer";
        return -1;
    }
    // Run until Ctrl-C is pressed
    server.RunUntilAskedToQuit();
    return 0;
}

client.h

#pragma once
#include <gflags/gflags.h>
#include <brpc/channel.h>
#include <butil/time.h>
#include <butil/logging.h>
#include <brpc/stream.h>
#include <boost/algorithm/string.hpp>
#include "/home/pigff/project/search_server/SG.pb.h"
#include "/home/pigff/project/search_engine/search_engine.h"

using std::string;

DECLARE_string(protocol);
DECLARE_string(search_attachment);
DECLARE_string(connection_type);
DECLARE_string(search_server);
DECLARE_string(load_balancer);
DECLARE_int32(timeout_ms);
DECLARE_int32(max_retry);
DECLARE_int32(interval_ms);
DECLARE_string(http_content_type);

class Client
{
public:
    Client();
    vector<News_Info> Return(string keyword);

private:
    // Channel to the search server
    brpc::Channel channel;
    // Options used to initialize the channel
    brpc::ChannelOptions options;
};

client

#include "client.h"

DEFINE_string(protocol, "baidu_std", "Protocol type. Defined in src/brpc/options.proto");
DEFINE_string(search_attachment, "foo", "Carry this along with requests");
DEFINE_string(connection_type, "", "Connection type. Available values: single, pooled, short");
DEFINE_string(search_server, "0.0.0.0:9999", "IP Address of server");
DEFINE_string(load_balancer, "", "The algorithm for load balancing");
DEFINE_int32(timeout_ms, 3000, "RPC timeout in milliseconds");
DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)");
DEFINE_int32(interval_ms, 5000, "Milliseconds between consecutive requests");
DEFINE_string(http_content_type, "application/json", "Content type of http request");

Client::Client()
{
    options.protocol = FLAGS_protocol;
    options.connection_type = FLAGS_connection_type;
    options.timeout_ms = FLAGS_timeout_ms;
    options.max_retry = FLAGS_max_retry;
    channel.Init(FLAGS_search_server.c_str(), FLAGS_load_balancer.c_str(), &options);
}

vector<News_Info> Client::Return(string keyword)
{
    // Normally, you should not call a Channel directly, but instead construct
    // a stub Service wrapping it. The stub can be shared by all threads.
    SG::Service_Stub stub(&channel);
    SG::Request req;
    SG::Response resp;
    brpc::Controller cntl;
    req.set_keyword(keyword);
    if (FLAGS_protocol != "http" && FLAGS_protocol != "h2c")
    {
        // Set an attachment that is wired to the network directly
        // instead of being serialized into the protobuf message
        cntl.request_attachment().append(FLAGS_search_attachment);
    }
    else
    {
        cntl.http_request().set_content_type(FLAGS_http_content_type);
    }
    // Because `done' (the last parameter) is NULL, this call blocks until
    // the response comes back or an error occurs (including timeout).
    stub.Search(&cntl, &req, &resp, NULL);
    vector<News_Info> v;
    if (!cntl.Failed())
    {
        // Copy the response data back for the caller
        for (auto& i : resp.info())
        {
            News_Info info(i.title(), i.author(), i.date());
            v.push_back(info);
        }
    }
    else
    {
        LOG(WARNING) << cntl.ErrorText();
    }
    return v;
}

Published 2024-02-14 08:57:56; original post: https://www.elefans.com/category/jswz/34/1762661.html