-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathparse_test.go
37 lines (33 loc) · 11.6 KB
/
parse_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
package main
import (
"net/url"
"strings"
"testing"
"github.com/PuerkitoBio/goquery"
)
// lzlTotalCommentTestString is the HTML fixture consumed by
// TestTotalCommentParserFcn. It holds five <li class="lzl_single_post ...">
// comment entries followed by one <li class="lzl_li_pager ..."> pager entry
// whose data-field reports total_num 15 and total_page 2. Several of the
// comment bodies begin with "回复" followed by an <a class="at"> user link,
// which is the case the test's special rule targets.
//
// NOTE(review): the markup references tieba.baidu.com and looks like a
// captured page fragment; the line breaks inside the literal are part of the
// fixture data — do not reformat.
var lzlTotalCommentTestString = `<li class="lzl_single_post j_lzl_s_p first_no_border" data-field='{"spid":132237789491,"user_name":"\u4ed6\u4eec\u597d\u5435\u554a","portrait":"tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA"}' ><a rel="noopener" name="132237789491"></a> <a rel="noopener" data-field='{"un":"\u4ed6\u4eec\u597d\u5435\u554a","id":"tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?un=%E4%BB%96%E4%BB%AC%E5%A5%BD%E5%90%B5%E5%95%8A&ie=utf-8&id=tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&fr=pb" username="他们好吵啊"><img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA"/></a><div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'><a rel="noopener" class="at j_user_card " data-field='{"un":"\u4ed6\u4eec\u597d\u5435\u554a","id":"tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA"}' href="/home/main/?un=%E4%BB%96%E4%BB%AC%E5%A5%BD%E5%90%B5%E5%95%8A&ie=utf-8&id=tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&fr=pb" target="_blank" username="他们好吵啊">终极闪耀赛罗✨</a>:<span class="lzl_content_main" data-username=""> 回复 <a href="http://tieba.baidu.com/i/sys/jump?un=" onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username="" portrait="tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw" target="_blank" class="at">阿比酱最棒啦💖</a> :说起来纸片人搞饭圈这一套就nm离谱<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon16.png" > </span><div class="lzl_content_reply"><span class="lzl_jb" style="display:none;"></span><span class="lzl_op_list j_lzl_o_l" style="display:none;"></span><span class="lzl_time">2020-5-17 22:37</span><a rel="noopener" href="#" class="lzl_s_r">回复</a></div></div></li><li class="lzl_single_post j_lzl_s_p " 
data-field='{"spid":132237796500,"user_name":"\u4ed6\u4eec\u597d\u5435\u554a","portrait":"tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA"}' ><a rel="noopener" name="132237796500"></a> <a rel="noopener" data-field='{"un":"\u4ed6\u4eec\u597d\u5435\u554a","id":"tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?un=%E4%BB%96%E4%BB%AC%E5%A5%BD%E5%90%B5%E5%95%8A&ie=utf-8&id=tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&fr=pb" username="他们好吵啊"><img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA"/></a><div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'><a rel="noopener" class="at j_user_card " data-field='{"un":"\u4ed6\u4eec\u597d\u5435\u554a","id":"tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA"}' href="/home/main/?un=%E4%BB%96%E4%BB%AC%E5%A5%BD%E5%90%B5%E5%95%8A&ie=utf-8&id=tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&fr=pb" target="_blank" username="他们好吵啊">终极闪耀赛罗✨</a>:<span class="lzl_content_main" data-username=""> 回复 <a href="http://tieba.baidu.com/i/sys/jump?un=" onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username="" portrait="tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw" target="_blank" class="at">阿比酱最棒啦💖</a> :算了,大佬打架我这萌新还是稍稍吧<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png" > </span><div class="lzl_content_reply"><span class="lzl_jb" style="display:none;"></span><span class="lzl_op_list j_lzl_o_l" style="display:none;"></span><span class="lzl_time">2020-5-17 22:37</span><a rel="noopener" href="#" class="lzl_s_r">回复</a></div></div></li><li class="lzl_single_post j_lzl_s_p " data-field='{"spid":132240827250,"user_name":"lonelyrangers","portrait":"tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA"}' ><a rel="noopener" name="132240827250"></a> <a rel="noopener" 
data-field='{"un":"lonelyrangers","id":"tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?un=lonelyrangers&ie=utf-8&id=tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA&fr=pb" username="lonelyrangers"><img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA"/></a><div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1608785198","level":2,"pic_url":"https:\/\/imgsa.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1608785198","pic_url":"https:\/\/imgsa.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'><a rel="noopener" class="at j_user_card " data-field='{"un":"lonelyrangers","id":"tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA"}' href="/home/main/?un=lonelyrangers&ie=utf-8&id=tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA&fr=pb" target="_blank" username="lonelyrangers">lonelyrangers</a>:<span class="lzl_content_main" data-username=""> 你这形容得太有画面感了<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png" > </span><div class="lzl_content_reply"><span class="lzl_jb" style="display:none;"></span><span class="lzl_op_list j_lzl_o_l" style="display:none;"></span><span class="lzl_time">2020-5-18 02:12</span><a rel="noopener" href="#" class="lzl_s_r">回复</a></div></div></li><li class="lzl_single_post j_lzl_s_p " data-field='{"spid":132252322102,"user_name":"\u73b0\u4ee3\u6dd1\u5973\u96be\u6c42\u554a","portrait":"tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA"}' ><a rel="noopener" name="132252322102"></a> <a rel="noopener" data-field='{"un":"\u73b0\u4ee3\u6dd1\u5973\u96be\u6c42\u554a","id":"tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA"}' target="_blank" class="j_user_card lzl_p_p" 
href="/home/main?un=%E7%8E%B0%E4%BB%A3%E6%B7%91%E5%A5%B3%E9%9A%BE%E6%B1%82%E5%95%8A&ie=utf-8&id=tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA&fr=pb" username="现代淑女难求啊"><img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA"/></a><div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'><a rel="noopener" class="at j_user_card " data-field='{"un":"\u73b0\u4ee3\u6dd1\u5973\u96be\u6c42\u554a","id":"tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA"}' href="/home/main/?un=%E7%8E%B0%E4%BB%A3%E6%B7%91%E5%A5%B3%E9%9A%BE%E6%B1%82%E5%95%8A&ie=utf-8&id=tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA&fr=pb" target="_blank" username="现代淑女难求啊">现代淑女难求啊</a>:<span class="lzl_content_main" data-username=""> 回复 <a href="http://tieba.baidu.com/i/sys/jump?un=" onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username="" portrait="tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw" target="_blank" class="at">阿比酱最棒啦💖</a> :是秦武阳<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png" > </span><div class="lzl_content_reply"><span class="lzl_jb" style="display:none;"></span><span class="lzl_op_list j_lzl_o_l" style="display:none;"></span><span class="lzl_time">2020-5-18 17:06</span><a rel="noopener" href="#" class="lzl_s_r">回复</a></div></div></li><li class="lzl_single_post j_lzl_s_p " data-field='{"spid":132252333701,"user_name":"470355389","portrait":"tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw"}' ><a rel="noopener" name="132252333701"></a> <a rel="noopener" data-field='{"un":"470355389","id":"tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?un=470355389&ie=utf-8&id=tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw&fr=pb" username="470355389"><img 
src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw"/></a><div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1617015024","level":2,"pic_url":"https:\/\/imgsa.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1617015024","pic_url":"https:\/\/imgsa.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'><a rel="noopener" class="at j_user_card " data-field='{"un":"470355389","id":"tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw"}' href="/home/main/?un=470355389&ie=utf-8&id=tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw&fr=pb" target="_blank" username="470355389">阿比酱最棒啦💖</a>:<span class="lzl_content_main" data-username=""> 回复 <a href="http://tieba.baidu.com/i/sys/jump?un=" onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username="" portrait="tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA" target="_blank" class="at">现代淑女难求啊</a> :啊这,丢人<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon11.png" > </span><div class="lzl_content_reply"><span class="lzl_jb" style="display:none;"></span><span class="lzl_op_list j_lzl_o_l" style="display:none;"></span><span class="lzl_time">2020-5-18 17:07</span><a rel="noopener" href="#" class="lzl_s_r">回复</a></div></div></li><li class="lzl_li_pager j_lzl_l_p lzl_li_pager_s" data-field='{"total_num":15,"total_page":2}' ><a rel="noopener" class="j_lzl_p btn-sub btn-small pull-right" href="##"><i class="icon-reply"></i>我也说一句</a> <p class="j_pager l_pager pager_theme_2"> <a href="#1">首页</a>
<a href="#1">上一页</a>
<a href="#1">1</a>
<span class="tP">2</span>
</p> </li>`
// TestTotalCommentParserFcn runs commentParserFcn over the
// lzlTotalCommentTestString fixture in HTMLLzl mode. For every parsed comment
// whose trimmed content starts with "回复", it re-parses the content with
// goquery, replaces the first a.at link with that link's inner HTML, and logs
// the resulting body markup.
//
// The test asserts nothing about the parsed values themselves; it reports an
// error only when goquery fails to parse or serialize a comment.
func TestTotalCommentParserFcn(t *testing.T) {
	u := &url.URL{}
	body := lzlTotalCommentTestString
	commentParserFcn(u, body, HTMLLzl, func(key uint64, value *LzlContent) {
		// special rule: remove username ahref in ": 回复 ", as requested in #4
		content := strings.Trim(string(value.Content), " ")
		if !strings.HasPrefix(content, "回复") {
			return
		}

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
		if err != nil {
			t.Errorf("failed to parse comment data: %v, reason: %s", content, err)
			// doc is not usable after a parse failure; stop here instead of
			// dereferencing it below.
			return
		}

		bodyDOM := doc.Find("body")
		s := doc.Find("a.at").First()
		userNameHTML, err := s.Html()
		if err != nil {
			t.Errorf("failed to read user link HTML for comment %d: %s", key, err)
			return
		}
		s.ReplaceWithHtml(userNameHTML)

		rendered, err := bodyDOM.Html()
		if err != nil {
			t.Errorf("failed to render comment %d: %s", key, err)
			return
		}
		// Log the markup as a plain value: it may contain '%' and must not be
		// interpreted as a format string.
		t.Log(rendered)
	}, func(string, string, string) {})
}