3.4 运行规则分析

本节会给大家提供一个参考实例，用于告诉大家如何根据具体的业务实现自己的爬虫框架。

我们以公共规则中“阿里巴巴产品搜索”为例（这些公共的规则都在github.com/pholcus下面包含，大家可以参考下）。

package spider_lib

// 基础包
import (
	"github.com/PuerkitoBio/goquery"                        //DOM解析
	"github.com/henrylee2cn/pholcus/app/downloader/context" //必需
	. "github.com/henrylee2cn/pholcus/app/spider"           //必需
	. "github.com/henrylee2cn/pholcus/app/spider/common"    //选用
	"github.com/henrylee2cn/pholcus/logs"                   //信息输出

	// net包
	"net/http" //设置http.Header
	// "net/url"

	// 编码包
	// "encoding/xml"
	// "encoding/json"

	// 字符串处理包
	// "regexp"
	"strconv"
	"strings"

	// 其他包
	// "fmt"
	// "math"
	// "time"
)

func init() {
	AlibabaProduct.Register()
}

var AlibabaProduct = &Spider{
	Name:        "阿里巴巴产品搜索",
	Description: "阿里巴巴产品搜索 [s.1688.com/selloffer/offer_search.htm]",
	// Pausetime: 300,
	Keyword:      KEYWORD,
	MaxPage:      MAXPAGE,
	EnableCookie: false,
	RuleTree: &RuleTree{
		Root: func(ctx *Context) {
			ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "生成请求"}, "生成请求")
		},

		Trunk: map[string]*Rule{

			"生成请求": {
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					keyword := EncodeString(ctx.GetKeyword(), "GBK")
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&context.Request{
							Url:    "http://s.1688.com/selloffer/offer_search.htm?enableAsync=false&earseDirect=false&button_click=top&pageSize=60&n=y&offset=3&uniqfield=pic_tag_id&keywords=" + keyword + "&beginPage=" + strconv.Itoa(loop[0]+1),
							Rule:   aid["Rule"].(string),
							Header: http.Header{"Content-Type": []string{"text/html", "charset=GBK"}},
						})
					}
					return nil
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					pageTag := query.Find("#sm-pagination div[data-total-page]")
					// 跳转
					if len(pageTag.Nodes) == 0 {
						logs.Log.Critical("[消息提示：| 任务：%v | 关键词：%v | 规则：%v] 由于跳转AJAX问题，目前只能每个子类抓取 1 页……\n", ctx.GetName(), ctx.GetKeyword(), ctx.GetRuleName())
						query.Find(".sm-floorhead-typemore a").Each(func(i int, s *goquery.Selection) {
							if href, ok := s.Attr("href"); ok {
								ctx.AddQueue(&context.Request{
									Url:    href,
									Header: http.Header{"Content-Type": []string{"text/html", "charset=GBK"}},
									Rule:   "搜索结果",
								})
							}
						})
						return
					}
					total1, _ := pageTag.First().Attr("data-total-page")
					total1 = strings.Trim(total1, " \t\n")
					total, _ := strconv.Atoi(total1)
					if total > ctx.GetMaxPage() {
						total = ctx.GetMaxPage()
					} else if total == 0 {
						logs.Log.Critical("[消息提示：| 任务：%v | 关键词：%v | 规则：%v] 没有抓取到任何数据！！！\n", ctx.GetName(), ctx.GetKeyword(), ctx.GetRuleName())
						return
					}

					// 调用指定规则下辅助函数
					ctx.Aid(map[string]interface{}{"loop": [2]int{1, total}, "Rule": "搜索结果"})
					// 用指定规则解析响应流
					ctx.Parse("搜索结果")
				},
			},

			"搜索结果": {
				//注意：有无字段语义和是否输出数据必须保持一致
				ItemFields: []string{
					"公司",
					"标题",
					"价格",
					"销量",
					"星级",
					"地址",
					"链接",
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()

					query.Find("#sm-offer-list > li").Each(func(i int, s *goquery.Selection) {

						// 获取公司
						company, _ := s.Find("a.sm-offer-companyName").First().Attr("title")

						// 获取标题
						t := s.Find(".sm-offer-title > a:nth-child(1)")
						title, _ := t.Attr("title")

						// 获取URL
						url, _ := t.Attr("href")

						// 获取价格
						price := s.Find(".sm-offer-priceNum").First().Text()

						// 获取成交量
						sales := s.Find("span.sm-offer-trade > em").First().Text()

						// 获取地址
						address, _ := s.Find(".sm-offer-location").First().Attr("title")

						// 获取信用年限
						level := s.Find("span.sm-offer-companyTag > a.sw-ui-flaticon-cxt16x16").First().Text()

						// 结果存入Response中转
						ctx.Output(map[int]interface{}{
							0: company,
							1: title,
							2: price,
							3: sales,
							4: level,
							5: address,
							6: url,
						})
					})
				},
			},
		},
	},
}

从代码中可以看到，总体上，我们实例化一个Spider类型的对象，如之前所讲，对象的Name是必须要有的：

	Name:        "阿里巴巴产品搜索",
	Description: "阿里巴巴产品搜索 [s.1688.com/selloffer/offer_search.htm]",
	// Pausetime: 300,
	Keyword:      KEYWORD,
	MaxPage:      MAXPAGE,
	EnableCookie: false,

这个结构的各种配置之前已经讲过，这里不赘述，如果需要调整，可以在这里赋值。

RuleTree正是我们的规则端的入口。Root: func可以认为程序在装载好任务后，首先执行的程序入口

	RuleTree: &RuleTree{
		Root: func(ctx *Context) {
			ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "生成请求"}, "生成请求")
		},

这里我们看到，我们通过ctx.Aid函数给规则添加了第一个任务，就是执行key为"生成请求"的AidFunc: func函数，顺着代码，我们可以找到这个函数的代码：

			"生成请求": {
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					keyword := EncodeString(ctx.GetKeyword(), "GBK")
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&context.Request{
							Url:    "http://s.1688.com/selloffer/offer_search.htm?enableAsync=false&earseDirect=false&button_click=top&pageSize=60&n=y&offset=3&uniqfield=pic_tag_id&keywords=" + keyword + "&beginPage=" + strconv.Itoa(loop[0]+1),
							Rule:   aid["Rule"].(string),
							Header: http.Header{"Content-Type": []string{"text/html", "charset=GBK"}},
						})
					}
					return nil
				},
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					pageTag := query.Find("#sm-pagination div[data-total-page]")
					// 跳转
					if len(pageTag.Nodes) == 0 {
						logs.Log.Critical("[消息提示：| 任务：%v | 关键词：%v | 规则：%v] 由于跳转AJAX问题，目前只能每个子类抓取 1 页……\n", ctx.GetName(), ctx.GetKeyword(), ctx.GetRuleName())
						query.Find(".sm-floorhead-typemore a").Each(func(i int, s *goquery.Selection) {
							if href, ok := s.Attr("href"); ok {
								ctx.AddQueue(&context.Request{
									Url:    href,
									Header: http.Header{"Content-Type": []string{"text/html", "charset=GBK"}},
									Rule:   "搜索结果",
								})
							}
						})
						return
					}
					total1, _ := pageTag.First().Attr("data-total-page")
					total1 = strings.Trim(total1, " \t\n")
					total, _ := strconv.Atoi(total1)
					if total > ctx.GetMaxPage() {
						total = ctx.GetMaxPage()
					} else if total == 0 {
						logs.Log.Critical("[消息提示：| 任务：%v | 关键词：%v | 规则：%v] 没有抓取到任何数据！！！\n", ctx.GetName(), ctx.GetKeyword(), ctx.GetRuleName())
						return
					}

					// 调用指定规则下辅助函数
					ctx.Aid(map[string]interface{}{"loop": [2]int{1, total}, "Rule": "搜索结果"})
					// 用指定规则解析响应流
					ctx.Parse("搜索结果")
				},
			},

可以看到，"生成请求"这个成员变量内部包含了两个函数AidFunc和ParseFunc，如之前的文章介绍的，通过Aid添加的任务由AidFunc解析，通过Addqueue函数添加的任务由ParseFunc负责解析，不同的是程序在执行ParseFunc之前会默认抓取Addqueue添加的的request请求，并把response添加到ctx中去。这就是为什么我们没有明显的看到URL请求，却能直接通过ctx.GetDom来获取Dom树了。此外，这里调用了Parse函数，执行过程是不解析url直接用当先的ctx执行ParseFunc函数。此外，代码中还有一些goquery的语法（建议参考W3c jquery相关语法）及golang本身的语法，这些大家可以自己研究下。

另外，我们可以看到一些关于输出的语句：

					ctx.Output(map[int]interface{}{
						0: title,
						1: ctx.GetTemp("description", ""),
						2: infoStr,
						3: ctx.GetTemp("releaseTime", ""),
						4: ctx.GetTemp("src", ""),
						5: ctx.GetTemp("author", ""),
					})

pholcus目前支持csv,mongo,mysql和excel四种数据存储方式，这些都可以在见面上手动设置。存储的方式统一都是已Key-Value数据的方式存储。

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!