-
Notifications
You must be signed in to change notification settings - Fork 3
/
main.go
125 lines (111 loc) · 3.21 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package main
import (
"context"
"fmt"
"log"
"regexp"
"time"
"github.com/chromedp/cdproto/target"
"github.com/chromedp/chromedp"
)
// Scrape target and browser identity used by scrapeNewArticleHtml.
const (
// _url is the Sogou WeChat search page that the browser navigates to first.
// The query parameter is a percent-encoded UTF-8 search term.
_url = "https://weixin.sogou.com/weixin?p=01030402&query=%E8%85%BE%E8%AE%AF%E7%8E%84%E6%AD%A6%E5%AE%9E%E9%AA%8C%E5%AE%A4&type=1&ie=utf8"
// _ua is the User-Agent string handed to the browser via chromedp.UserAgent.
_ua = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
)
// Regular-expression sources applied to the article HTML by printArticle.
// Both depend on the exact inline styles and tag nesting of the page markup.
const (
// _linkReg captures the text between a styled <br> tag and the closing
// </a></span></span></p> sequence; capture group 1 is printed as the link.
_linkReg = `<br style="box-sizing: border-box;">(.+?)</a></span></span></p>`
// _titleReg captures the text of the 16px <span> that directly follows a
// styled <q> tag; capture group 1 is printed as the title.
_titleReg = `<q style="box-sizing: border-box;"><span style="font-size: 16px;">(.+?)</span>`
)
// Compiled forms of the patterns above. They are assigned in init and read by
// printArticle.
var (
// _xLinkRegexp is the compiled _linkReg.
_xLinkRegexp *regexp.Regexp
// _xTitleRegexp is the compiled _titleReg.
_xTitleRegexp *regexp.Regexp
)
func init() {
_xLinkRegexp = compileReg(_linkReg)
_xTitleRegexp = compileReg(_titleReg)
}
// compileReg compiles reg and returns the resulting regular expression.
//
// It panics if reg is not a valid pattern. The patterns passed in are
// compile-time constants compiled during package initialisation, so an
// invalid one is a programming error; failing immediately with the offending
// pattern is preferable to returning a nil *regexp.Regexp that would only
// surface later as a nil-pointer dereference at the first match attempt.
func compileReg(reg string) *regexp.Regexp {
	return regexp.MustCompile(reg)
}
func main() {
html := scrapeNewArticleHtml()
printArticle(html)
}
// scrapeNewArticleHtml drives a Chrome instance through chromedp and returns
// the HTML of the article that opens from the search page.
//
// It navigates to _url, clicks the link matching a[uigs="account_article_0"],
// waits for the tab opened by that click, and returns the outer HTML of the
// #js_content element in that tab. The whole operation is bounded by a
// 15-second timeout. On any failure the error is logged and an empty string
// is returned.
func scrapeNewArticleHtml() (html string) {
	// Allocator options: chromedp's defaults plus our overrides.
	options := []chromedp.ExecAllocatorOption{
		chromedp.Flag("headless", false), // show the browser window for debugging
		chromedp.UserAgent(_ua),          // set the User-Agent
	}
	options = append(chromedp.DefaultExecAllocatorOptions[:], options...)

	allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), options...)
	defer cancel()

	// Create the Chrome instance.
	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	// Bound the whole scrape with a timeout.
	ctx, cancel = context.WithTimeout(ctx, 15*time.Second)
	defer cancel()

	// Listen for the target ID of the second tab.
	ch := make(chan target.ID, 1)
	chromedp.ListenTarget(ctx, func(ev interface{}) {
		created, ok := ev.(*target.EventTargetCreated)
		// An empty OpenerID means this is the first tab, not one opened by it.
		if !ok || created.TargetInfo.OpenerID == "" {
			return
		}
		// The listener is invoked synchronously by chromedp, so it must never
		// block: keep the first opened target and drop any later ones.
		select {
		case ch <- created.TargetInfo.TargetID:
		default:
		}
	})

	if err := chromedp.Run(ctx,
		chromedp.Tasks{
			// Open the search page.
			chromedp.Navigate(_url),
			// Wait until the page body is visible.
			chromedp.WaitVisible("body"),
			// Pause for 2 seconds.
			chromedp.Sleep(2 * time.Second),
			// Click the article link.
			chromedp.Click(`a[uigs="account_article_0"]`, chromedp.NodeVisible),
			chromedp.Sleep(3 * time.Second),
		},
	); err != nil {
		log.Printf("[scrapeNewArticle] chromedp Run fail,err: %s", err.Error())
		return ""
	}

	// Wait for the second tab, but give up when the timeout fires: if the
	// click never opened a new tab a bare receive would block forever.
	var tabID target.ID
	select {
	case tabID = <-ch:
	case <-ctx.Done():
		log.Printf("[scrapeNewArticle] wait for new tab fail,err: %s", ctx.Err().Error())
		return ""
	}

	// Attach to the second tab and read the article container.
	newCtx, cancel := chromedp.NewContext(ctx, chromedp.WithTargetID(tabID))
	defer cancel()
	if err := chromedp.Run(
		newCtx,
		chromedp.Sleep(1*time.Second),
		chromedp.OuterHTML("#js_content", &html, chromedp.ByID),
	); err != nil {
		log.Printf("[scrapeNewArticle] chromedp Run fail,err: %s", err.Error())
		return ""
	}
	return html
}
// printArticle extracts link/title pairs from the article HTML using the
// package-level compiled patterns and prints each pair to standard output.
//
// Nothing is printed when html is empty. Links and titles are paired by
// position, so when the two patterns match a different number of times the
// pairing cannot be trusted: the mismatch is logged and nothing is printed.
func printArticle(html string) {
	if html == "" {
		return
	}

	linkList := _xLinkRegexp.FindAllStringSubmatch(html, -1)
	titleList := _xTitleRegexp.FindAllStringSubmatch(html, -1)
	if len(linkList) != len(titleList) {
		log.Printf("[printArticle] link/title count mismatch, links: %d, titles: %d",
			len(linkList), len(titleList))
		return
	}

	for i, link := range linkList {
		title := titleList[i]
		// Index 1 is the first capture group, so each match needs at least
		// two entries before it can be read safely.
		if len(link) < 2 || len(title) < 2 {
			continue
		}
		fmt.Println("link: ", link[1])
		fmt.Println("title: ", title[1])
	}
}