【可搜索范围】
1. 正常网页搜索
数据:URL、标题,描述
2. 百科(百度,360)
数据:URL、标题,描述,作者,发布日期
3. 微博
数据:URL、标题,描述,作者,发布日期,微博类型(新浪、腾讯、163、搜狐),评论数,转发数
4. 新闻
数据:URL、标题,描述,发布日期,发布网站
5. 论坛
数据:URL、标题,描述,发布日期,发布网站
6. 博客
数据:URL、标题,描述,发布日期,发布网站
【原理】
1. 获取各大搜索引擎的 URL 规则及分页规则。
2. 对搜索结果页面进行结构化解析,参考:http://blog.csdn.net/a286352250/article/details/14520643
各大搜索引擎的【URL 规则配置、分页规则】(最新更新:2013-11-08)
/**
 * Describes how to build a paged search request for one search engine.
 *
 * <p>A configuration holds the engine's base URL (ending in the query
 * parameter), the name of the paging parameter, the search query and the
 * current page index. The static {@code generate...Configuration()} factories
 * return a configuration positioned on the first page with no query set; call
 * {@link #setQuery(String)} before {@link #generateRequestUrl()}.
 */
public class SeoConfiguration {

    public final static String NAME_SOSO="soso";
    public final static String NAME_BAIDU="baidu";
    public final static String NAME_GOOGLE="google";
    public final static String NAME_BING_WEB="bingweb";
    public final static String NAME_SOGOU="sogou";
    public final static String NAME_YOUDAO="youdao";
    public final static String NAME_360="360";
    public final static String NAME_BAIDU_WEIBO="baidu_weibo";
    public final static String NAME_BAIDU_BBS="baidu_bbs";
    public final static String NAME_BAIDU_BLOG="baidu_blog";
    public final static String NAME_BAIDU_NEWS="baidu_news";
    public final static String NAME_360_NEWS="360_news";
    public final static String NAME_BAIDU_BAIKE="baidu_baike";
    public final static String NAME_360_BAIKE="360_baike";
    public final static String NAME_GOOGLE_BLOG="google_blog";
    public final static String NAME_BING_YINGXIANG="bing_yingxiang";

    // Each comment below gives: paging parameter, step per page, first-page value
    // (as used by the factory methods and addPageIndex()).

    // pn, step 10, starts at 0
    private final static String baidu_url="http://www.baidu.com/s?ie=utf-8&usm=6&rsv_page=1&wd=";
    // start, step 10, starts at 0
    private final static String google_url="http://ajax.googleapis.com/ajax/services/search/web?v=2.0&rsz=large&q=";
    // pg, step 1, starts at 1
    private final static String soso_url="http://www.soso.com/q?sc=web&ch=w.uf&num=10&w=";
    // first, step 10, starts at 1
    private final static String bing_web_url="http://cn.bing.com/search?go=&qs=bs&first=1&FORM=PORE&q=";
    // page, step 1, starts at 1
    private final static String sogou_url="http://www.sogou.com/web?query=";
    // start, step 10, starts at 1
    private final static String youdao_url="http://www.youdao.com/search?ue=utf8&keyfrom=web.nextPage&timesort=0&q=";
    // pn, step 1, starts at 1
    private final static String qihu360_url="http://www.so.com/s?j=0&q=";
    // pn, step 20, starts at 0
    private final static String baidu_weibo_url="http://www.baidu.com/s?cl=2&tn=baiduwb&rn=20&ie=utf-8&rtt=2&wd=";
    // pn, step 10, starts at 0
    private final static String baidu_bbs_url="http://www.baidu.com/s?pbs=1&tn=baidurt&bsst=1&ie=utf-8&rtt=1&wd=";
    // pn, step 10, starts at 0
    private final static String baidu_blog_url="http://www.baidu.com/s?tn=baidurt&rtt=1&pbl=1&pbs=0&bsst=1&ie=utf-8&wd=";
    // pn, step 20, starts at 0
    private final static String baidu_news_url="http://news.baidu.com/ns?bt=0&et=0&si=&rn=20&tn=news&ie=utf-8&ct=1&cl=2&word=";
    // pn, step 1, starts at 1
    private final static String qihu360_news_url="http://news.so.com/ns?tn=news&rank=rank&q=";
    // pn, step 20, starts at 0
    // NOTE(review): the URL asks for rn=10 while paging advances by 20 - confirm the intended page size.
    private final static String baidu_baike_url="http://baike.baidu.com/search?type=0&pn=0&rn=10&submit=search&word=";
    // p, step 1, starts at 1
    private final static String qihu360_baike_url="http://baike.so.com/search/?word=";
    // start, step 10, starts at 0
    private final static String google_blog_url="http://ajax.googleapis.com/ajax/services/search/blogs?v=2.0&rsz=large&q=";
    // first, step 10, starts at 1
    private final static String bing_yingxiang_url="http://cn.bing.com/yingxiangli/search?qs=n&form=BSCTAB&scope=q&sc=0-0&sp=-1&sk=&q=";

    private Integer id;
    private String name;
    private String url;
    private String pageParam;
    private String query;
    private Integer pageIndex;

    public SeoConfiguration() {
    }

    /**
     * @param name      engine name, normally one of the {@code NAME_*} constants
     * @param url       base URL ending with the query parameter (e.g. {@code ...&q=})
     * @param pageParam paging parameter including separators (e.g. {@code "&pn="})
     * @param query     raw (un-encoded) search text; may be set later
     * @param pageIndex value of the paging parameter for the current page
     */
    public SeoConfiguration(String name, String url, String pageParam, String query, Integer pageIndex) {
        this.name = name;
        this.url = url;
        this.pageParam = pageParam;
        this.query = query;
        this.pageIndex = pageIndex;
    }

    /**
     * Same as the five-argument constructor, additionally setting the identifier.
     */
    public SeoConfiguration(Integer id, String name, String url, String pageParam, String query, Integer pageIndex) {
        this(name, url, pageParam, query, pageIndex);
        this.id = id;
    }

    public static SeoConfiguration generateBaiduConfiguration(){
        return new SeoConfiguration(NAME_BAIDU, baidu_url, "&pn=", null, 0);
    }

    public static SeoConfiguration generateGoogleConfiguration(){
        return new SeoConfiguration(NAME_GOOGLE, google_url, "&start=", null, 0);
    }

    public static SeoConfiguration generateSosoConfiguration(){
        return new SeoConfiguration(NAME_SOSO, soso_url, "&pg=", null, 1);
    }

    public static SeoConfiguration generateBingWebConfiguration(){
        return new SeoConfiguration(NAME_BING_WEB, bing_web_url, "&first=", null, 1);
    }

    public static SeoConfiguration generateSogouConfiguration(){
        return new SeoConfiguration(NAME_SOGOU, sogou_url, "&page=", null, 1);
    }

    public static SeoConfiguration generateYoudaoConfiguration(){
        return new SeoConfiguration(NAME_YOUDAO, youdao_url, "&start=", null, 1);
    }

    public static SeoConfiguration generate360Configuration(){
        return new SeoConfiguration(NAME_360, qihu360_url, "&pn=", null, 1);
    }

    public static SeoConfiguration generateBaiduWeiboConfiguration(){
        return new SeoConfiguration(NAME_BAIDU_WEIBO, baidu_weibo_url, "&pn=", null, 0);
    }

    public static SeoConfiguration generateBaiduBBSConfiguration(){
        return new SeoConfiguration(NAME_BAIDU_BBS, baidu_bbs_url, "&pn=", null, 0);
    }

    public static SeoConfiguration generateBaiduBlogConfiguration() {
        return new SeoConfiguration(NAME_BAIDU_BLOG, baidu_blog_url, "&pn=", null, 0);
    }

    public static SeoConfiguration generateBaiduNewsConfiguration(){
        return new SeoConfiguration(NAME_BAIDU_NEWS, baidu_news_url, "&pn=", null, 0);
    }

    public static SeoConfiguration generate360NewsConfiguration(){
        return new SeoConfiguration(NAME_360_NEWS, qihu360_news_url, "&pn=", null, 1);
    }

    public static SeoConfiguration generateBaiduBaikeConfiguration(){
        return new SeoConfiguration(NAME_BAIDU_BAIKE, baidu_baike_url, "&pn=", null, 0);
    }

    public static SeoConfiguration generate360BaikeConfiguration(){
        return new SeoConfiguration(NAME_360_BAIKE, qihu360_baike_url, "&p=", null, 1);
    }

    /**
     * Builds the Google blog-search configuration.
     */
    public static SeoConfiguration generateGoogleBlogConfiguration(){
        return new SeoConfiguration(NAME_GOOGLE_BLOG, google_blog_url, "&start=", null, 0);
    }

    /**
     * Builds the Google blog-search configuration. Despite the "Bbs" in its
     * name this has always targeted blog search; it is kept so existing
     * callers continue to compile. Prefer {@link #generateGoogleBlogConfiguration()}.
     */
    public static SeoConfiguration generateGoogleBbsConfiguration(){
        return generateGoogleBlogConfiguration();
    }

    public static SeoConfiguration generateBingYingXiangConfiguration(){
        return new SeoConfiguration(NAME_BING_YINGXIANG, bing_yingxiang_url, "&first=", null, 1);
    }

    /**
     * Advances {@code pageIndex} to the next page using the step that belongs
     * to this configuration's engine name. Configurations whose name is not
     * one of the {@code NAME_*} constants are left untouched.
     */
    public void addPageIndex(){
        int step = pageStep();
        // Skip the arithmetic entirely for unknown engines so a null pageIndex is not unboxed.
        if (step != 0) {
            this.pageIndex += step;
        }
    }

    /**
     * Returns how much the paging parameter grows per page for the current
     * engine name, or 0 when the name is not recognised.
     */
    private int pageStep() {
        if (NAME_SOSO.equals(name) || NAME_SOGOU.equals(name) || NAME_360.equals(name)
                || NAME_360_NEWS.equals(name) || NAME_360_BAIKE.equals(name)) {
            return 1;
        }
        if (NAME_BAIDU_WEIBO.equals(name) || NAME_BAIDU_NEWS.equals(name)
                || NAME_BAIDU_BAIKE.equals(name)) {
            return 20;
        }
        if (NAME_BAIDU.equals(name) || NAME_GOOGLE.equals(name) || NAME_BING_WEB.equals(name)
                || NAME_YOUDAO.equals(name) || NAME_GOOGLE_BLOG.equals(name)
                || NAME_BAIDU_BLOG.equals(name) || NAME_BAIDU_BBS.equals(name)
                || NAME_BING_YINGXIANG.equals(name)) {
            return 10;
        }
        return 0;
    }

    /**
     * Builds the request URL for the current page with the query encoded as UTF-8.
     *
     * @return base URL + encoded query + paging parameter + page index
     * @throws NullPointerException if no query has been set
     */
    public String generateRequestUrl(){
        return generateRequestUrl("utf-8");
    }

    /**
     * Builds the request URL for the current page.
     *
     * @param enc character encoding name used to URL-encode the query
     * @return base URL + encoded query + paging parameter + page index
     * @throws NullPointerException if no query has been set
     * @throws RuntimeException     wrapping {@code UnsupportedEncodingException}
     *                              when {@code enc} is not a supported encoding
     */
    public String generateRequestUrl(String enc){
        return url+generateURLEncoder(enc)+pageParam+pageIndex;
    }

    /**
     * URL-encodes the query with the given encoding.
     */
    private String generateURLEncoder(String enc){
        // The factories create configurations without a query; fail with a clear
        // message instead of an anonymous NullPointerException from URLEncoder.
        if (query == null) {
            throw new NullPointerException("query must be set before generating a request URL");
        }
        try {
            return URLEncoder.encode(query, enc);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    // ------------------- getter and setter -----------------------------------------------------------------------

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getQuery() {
        return query;
    }

    public void setQuery(String query) {
        this.query = query;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getPageParam() {
        return pageParam;
    }

    public void setPageParam(String pageParam) {
        this.pageParam = pageParam;
    }

    public Integer getPageIndex() {
        return pageIndex;
    }

    public void setPageIndex(Integer pageIndex) {
        this.pageIndex = pageIndex;
    }
}
【结构化对象】
/**
 * One structured search hit. Which fields are populated depends on the kind
 * of search that produced it (web page, news, weibo, forum, influence); the
 * constructors below cover those combinations and every field also has a
 * plain getter/setter.
 */
public class SeoResult {

    private Integer id;
    private String url;
    private String title;
    private String description;
    /**
     * Source / site the item was published on or reposted from.
     */
    private String gdSource;
    /**
     * Publish date.
     */
    private Integer pubtime;
    /**
     * Author.
     */
    private String author;
    /**
     * Weibo provider code:
     * Sina = 1,
     * QQ = 2,
     * Sohu = 3,
     * 163 = 4.
     */
    private Integer weiboType;
    /**
     * Total number of comments.
     */
    private Long pingTotal;
    /**
     * Total number of reposts.
     */
    private Long transTotal;
    /**
     * Influence.
     */
    private String influence;

    public SeoResult() {
    }

    /**
     * Web page result.
     */
    public SeoResult(String url, String title, String description) {
        this.url = url;
        this.title = title;
        this.description = description;
    }

    /**
     * News result.
     *
     * @param pubtime publish date; may be {@code null} when unknown. The value
     *                is narrowed to {@code int}, so it must fit in 32 bits.
     */
    public SeoResult(String url, String title, String description, String gdSource, Long pubtime) {
        this.url = url;
        this.title = title;
        this.description = description;
        this.gdSource = gdSource;
        // The field is a nullable Integer, so an unknown publish date must not blow up here.
        this.pubtime = (pubtime == null) ? null : Integer.valueOf(pubtime.intValue());
    }

    /**
     * Weibo result.
     */
    public SeoResult(String url, String description, String author, Integer weiboType, Long pingTotal, Long transTotal) {
        this.url = url;
        this.description = description;
        this.author = author;
        this.weiboType = weiboType;
        this.pingTotal = pingTotal;
        this.transTotal = transTotal;
    }

    /**
     * Forum result.
     */
    public SeoResult(String url, String description, String author, String BBSTypeStr) {
        this.url = url;
        this.description = description;
        // NOTE(review): the "author" argument is stored as the title and the author
        // field stays unset. Kept as-is for existing callers - confirm whether intended.
        this.title = author;
        this.gdSource = BBSTypeStr;
    }

    /**
     * Influence result.
     */
    public SeoResult(String url, String description, Integer weiboType, String influence, Integer pubtime) {
        this.url = url;
        this.description = description;
        this.weiboType = weiboType;
        this.influence = influence;
        this.pubtime = pubtime;
    }

    // ------------------------ getter and setter ------------------------------------------------------

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public Integer getWeiboType() {
        return weiboType;
    }

    public void setWeiboType(Integer weiboType) {
        this.weiboType = weiboType;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public Long getPingTotal() {
        return pingTotal;
    }

    public void setPingTotal(Long pingTotal) {
        this.pingTotal = pingTotal;
    }

    public Long getTransTotal() {
        return transTotal;
    }

    public void setTransTotal(Long transTotal) {
        this.transTotal = transTotal;
    }

    public String getGdSource() {
        return gdSource;
    }

    public void setGdSource(String gdSource) {
        this.gdSource = gdSource;
    }

    public Integer getPubtime() {
        return pubtime;
    }

    public void setPubtime(Integer pubtime) {
        this.pubtime = pubtime;
    }

    public String getInfluence() {
        return influence;
    }

    public void setInfluence(String influence) {
        this.influence = influence;
    }
}
【案例 -- 微博搜索】注:因页面太多,只提供微博搜索案例,其它请参考自行添加
/**
 * Example {@link SeoClientService} that fetches a Baidu weibo search page with
 * Jsoup and converts each result item into a {@link SeoResult}.
 */
public class BaiduWeiboClientServiceImpl implements SeoClientService {

    /**
     * Fetches the page described by {@code configuration} and parses its
     * weibo items.
     *
     * <p>Items whose comment or repost counter cannot be read are skipped
     * individually, as are items rejected by
     * {@code SeoResultFilter.filterSeoResult}. If fetching or parsing the page
     * fails as a whole, the exception is printed and an empty list is returned.
     *
     * @param configuration request configuration; its current request URL is fetched
     * @return the parsed results, never {@code null}
     */
    public List<SeoResult> findResults(SeoConfiguration configuration) {
        try {
            List<SeoResult> seoResults=new ArrayList<SeoResult>();
            String requestUrl = configuration.generateRequestUrl();
            System.out.println(requestUrl);
            Document doc = Jsoup.connect(requestUrl).get();
            for (Element element : doc.select("div[id=wrapper] div[id=main] div[class=content_bg] div[class=content] ol[id=weibo] li")) {
                SeoResult seoResult = parseResult(element);
                if (seoResult == null || !SeoResultFilter.filterSeoResult(seoResult)) {
                    continue;
                }
                seoResults.add(seoResult);
            }
            return seoResults;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return new ArrayList<SeoResult>();
    }

    /**
     * Converts one result list item into a {@link SeoResult}, or returns
     * {@code null} when its comment/repost counters are missing or malformed.
     */
    private static SeoResult parseResult(Element element) {
        String url = element.select("div a[name=weibo_rootnick]").attr("href");
        String description = element.select("div").text();
        String author = element.select("div a[name=weibo_rootnick]").text();
        Integer weiboType = parseWeiboType(element.select("div div[class=weibo_info] div[class=m] a").text());
        Long pingTotal = parseCount(element.select("div div[class=weibo_info] div[class=weibo_pz] a[name=weibo_ping]").text());
        Long transTotal = parseCount(element.select("div div[class=weibo_info] div[class=weibo_pz] a[name=weibo_trans]").text());
        if (pingTotal == null || transTotal == null) {
            // One odd item should not discard the rest of the page.
            return null;
        }
        return new SeoResult(url, description, author, weiboType, pingTotal, transTotal);
    }

    /**
     * Maps the provider label found in the item's info line to the numeric
     * weibo type (1 Sina, 2 Tencent, 3 Sohu, 4 NetEase). The label is taken
     * from the second space-separated token, after any "-" it contains.
     *
     * @return the type code, or {@code null} when the label is absent or unrecognised
     */
    private static Integer parseWeiboType(String infoText) {
        String[] tokens = infoText.split(" ");
        if (tokens.length < 2) {
            return null;
        }
        String label = tokens[1].substring(tokens[1].indexOf("-")+1).trim();
        System.out.println(label);
        if ("新浪微博".equals(label)) {
            return Integer.valueOf(1);
        } else if ("腾讯微博".equals(label)) {
            return Integer.valueOf(2);
        } else if ("搜狐微博".equals(label)) {
            return Integer.valueOf(3);
        } else if ("网易微博".equals(label)) {
            return Integer.valueOf(4);
        }
        return null;
    }

    /**
     * Extracts the number between the first "(" and the first ")" of a
     * counter link text.
     *
     * @return the parsed count, or {@code null} when there is no parenthesised number
     */
    private static Long parseCount(String text) {
        int open = text.indexOf("(");
        int close = text.indexOf(")");
        if (open < 0 || close <= open) {
            return null;
        }
        try {
            return Long.valueOf(text.substring(open+1, close).trim());
        } catch (NumberFormatException e) {
            return null;
        }
    }
}
更多推荐
搜索引擎信息采集
发布评论