先上一个在线邮箱采集demo样例:http://www.jsanai/emailcollect.html
这样的在线工具,其原理与普通的客户端工具(例如八爪鱼采集工具等)是一样的,所以这里以这个在线工具作为参考进行实现分析。
邮箱采集原理:
1、根据要采集的url地址,获取页面html内容,然后采用正则匹配出页面的url列表、邮箱地址列表。
2、分两个进程:
①保存邮箱地址;
②分析采集子页面url的邮箱地址;
基本源码(golang):
// CollectEmail is the collection entry point: it downloads the page at hosturl
// and extracts contact data and follow-up links from its HTML.
//
// Returns:
//   - an EmailObj with Surl set to hosturl, Emails filled from matchEmail and
//     Phones filled from matchPhone (Emails is passed through
//     RemoveRepeatedElement before returning);
//   - the links found on the page that point at the same host as hosturl,
//     rewritten to absolute form where they were relative, also passed through
//     RemoveRepeatedElement;
//   - a non-nil error when hosturl cannot be parsed, the page cannot be
//     fetched, or a GBK-family body cannot be decoded.
func CollectEmail(hosturl string) (EmailObj, []string, error) {
	var result EmailObj
	var inhost []string

	// The scheme and host of the start URL decide which links count as internal.
	base, err := url.Parse(hosturl)
	if err != nil {
		return result, inhost, err
	}
	result.Surl = hosturl

	bodystr, err := HttpGetV2(hosturl)
	if err != nil {
		return result, inhost, errors.New("get request error")
	}

	// Pages that declare a GBK-family charset are transcoded before matching.
	// The second probe used to look for "bg2312", a typo for "gb2312".
	if strings.Contains(bodystr, "charset=gb") || strings.Contains(bodystr, "gb2312") {
		decoded, err := simplifiedchinese.GB18030.NewDecoder().Bytes([]byte(bodystr))
		if err != nil {
			return result, inhost, errors.New("simplifiedchinese coding change error")
		}
		bodystr = string(decoded)
	}

	// E-mail addresses and phone numbers found on this page.
	result.Emails = append(result.Emails, matchEmail(bodystr)...)
	result.Phones = append(result.Phones, matchPhone(bodystr)...)

	// Collect the links that stay on the same site.
	for _, link := range matchUrls(bodystr) {
		parsed, err := url.Parse(link)
		if err != nil {
			continue
		}
		// Skip script and stylesheet assets. A suffix test is used so that pages
		// such as "contact.jsp" are not discarded along with "*.js".
		if strings.HasSuffix(parsed.Path, ".js") || strings.HasSuffix(parsed.Path, ".css") {
			continue
		}

		switch parsed.Scheme {
		case "http", "https":
			// Absolute link: keep it only when it targets the start host.
			if parsed.Host == base.Host {
				inhost = append(inhost, link)
			}
		case "":
			// Scheme-less link: either relative ("/a", "a") or
			// protocol-relative ("//host/a"). The latter is internal only
			// when its host matches the start host.
			if parsed.Host != "" && parsed.Host != base.Host {
				continue
			}
			if strings.HasPrefix(parsed.Path, "/") {
				inhost = append(inhost, base.Scheme+"://"+base.Host+parsed.Path)
			} else {
				inhost = append(inhost, base.Scheme+"://"+base.Host+"/"+parsed.Path)
			}
		default:
			// mailto:, javascript:, tel: and similar are not crawlable pages.
		}
	}

	inhost = RemoveRepeatedElement(inhost)
	result.Emails = RemoveRepeatedElement(result.Emails)
	return result, inhost, nil
}
// Patterns used by matchEmail, compiled once instead of on every call.
var (
	emailStyleBlockRe  = regexp.MustCompile(`<style[\S\s]+?</style>`)
	emailScriptBlockRe = regexp.MustCompile(`<script[\S\s]+?</script>`)
	emailHTMLTagRe     = regexp.MustCompile(`<[^>]*?>`)
	// An address is: local part, "@" (or "#" as an obfuscated "@"), one or more
	// domain labels, and a final label of com, cn, org or net. Multi-label
	// domains such as example.com.cn are covered by the repeated label group.
	emailAddressRe = regexp.MustCompile(`(?i)\w[\w.+-]*[@#][\w-]+(?:\.[\w-]+)*\.(?:com|cn|org|net)\b`)
)

// matchEmail returns every e-mail address found in the visible text of the
// HTML document str, in order of appearance.
//
// <style> and <script> blocks and all remaining tags are removed before
// matching. Addresses are lower-cased and a "#" separator is rewritten to "@".
// The result is nil when nothing matches; duplicates are not removed.
func matchEmail(str string) (email []string) {
	// Drop non-visible content first so addresses inside CSS/JS are ignored.
	str = emailStyleBlockRe.ReplaceAllString(str, "")
	str = emailScriptBlockRe.ReplaceAllString(str, "")
	str = emailHTMLTagRe.ReplaceAllString(str, "")

	var emailList []string
	for _, addr := range emailAddressRe.FindAllString(str, -1) {
		emailList = append(emailList, strings.Replace(strings.ToLower(addr), "#", "@", -1))
	}
	return emailList
}
// matchUrls scans str for anchor elements and returns the href value of each
// one, in document order. Only anchors whose content contains no nested tag
// are recognised by the pattern. The result is nil when no anchor matches.
func matchUrls(str string) (urls []string) {
	anchorRe := regexp.MustCompile("<a[^>]*?href=[\"|']+([^\"]*?)[\"|'][^>]*?>[^<]*?</a>")
	found := anchorRe.FindAllStringSubmatch(str, -1)

	var hrefs []string
	for i := 0; i < len(found); i++ {
		// Group 1 is the text captured between the href quotes.
		hrefs = append(hrefs, found[i][1])
	}
	return hrefs
}
源码只实现了核心采集功能,应用层及ui页面请自行设计;前端可以通过websocket连接与后端交互,实现采集功能的定制开发。这里只提供基本的功能实现及原理,涉及爬虫及反爬虫的部分,可以根据需求进行定制开发。欢迎留言交流。
更多推荐
全网邮箱email地址采集api接口及实现分析
发布评论