Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 7664b38

Browse files
committed
增加个人博客自动抓取功能 (Add automatic crawling of personal blog articles)
1 parent a719684 commit 7664b38

File tree

1 file changed

+54
-25
lines changed
  • websites/code/studygolang/src/server/crawlarticle

1 file changed

+54
-25
lines changed

websites/code/studygolang/src/server/crawlarticle/autocrawl.go

Lines changed: 54 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"fmt"
1111
"log"
1212
"regexp"
13+
"strconv"
1314
"strings"
1415

1516
"config"
@@ -22,7 +23,7 @@ import (
2223

2324
var websites = make(map[string]map[string]string)
2425

25-
const pattern = "go|golang|goroutine|channel/i"
26+
const pattern = "(?i)go|golang|goroutine|channel"
2627

2728
func autocrawl(needAll bool, crawlConfFile string, whichSite string) {
2829

@@ -64,7 +65,6 @@ func doCrawl(wbconf map[string]string, isAll bool) {
6465
crawlUrl = wbconf["all_url"]
6566
}
6667

67-
keywords := strings.Split(wbconf["keywords"], ",")
6868
listselector := wbconf["listselector"]
6969
resultselector := wbconf["resultselector"]
7070
pageField := wbconf["page_field"]
@@ -74,39 +74,68 @@ func doCrawl(wbconf map[string]string, isAll bool) {
7474
maxPage = util.MustInt(wbconf["max_page"])
7575
}
7676

77-
var (
78-
doc *goquery.Document
79-
err error
80-
)
77+
// 个人博客,一般通过 tag 方式获取,这种处理方式和搜索不一样
78+
if wbconf["keywords"] == "" {
79+
for p := maxPage; p >= 1; p-- {
80+
if pageField == "" {
81+
82+
// 标题不包含 go 等关键词的,也入库
83+
if err := parseArticleList(crawlUrl+strconv.Itoa(p), listselector, resultselector, false); err != nil {
84+
break
85+
}
86+
}
87+
}
88+
89+
return
90+
}
91+
92+
keywords := strings.Split(wbconf["keywords"], ",")
8193

8294
for _, keyword := range keywords {
8395
for p := 1; p <= maxPage; p++ {
8496

8597
page := fmt.Sprintf("&%s=%d", pageField, p)
86-
logger.Infoln("parse url:", crawlUrl+keyword+page)
87-
if doc, err = goquery.NewDocument(crawlUrl + keyword + page); err != nil {
98+
if err := parseArticleList(crawlUrl+keyword+page, listselector, resultselector, true); err != nil {
99+
logger.Errorln("parse article url error:", err)
88100
break
89101
}
102+
}
103+
}
104+
}
90105

91-
doc.Find(listselector).Each(func(i int, contentSelection *goquery.Selection) {
106+
func parseArticleList(url, listselector, resultselector string, isAuto bool) (err error) {
92107

93-
aSelection := contentSelection.Find(resultselector)
94-
title := aSelection.Text()
95-
matched, err := regexp.MatchString(pattern, title)
96-
if err != nil {
97-
logger.Errorln(err)
98-
return
99-
}
108+
logger.Infoln("parse url:", url)
100109

101-
if !matched {
102-
return
103-
}
110+
var doc *goquery.Document
104111

105-
articleUrl, ok := aSelection.Attr("href")
106-
if ok {
107-
service.ParseArticle(articleUrl, true)
108-
}
109-
})
110-
}
112+
if doc, err = goquery.NewDocument(url); err != nil {
113+
return
111114
}
115+
116+
doc.Find(listselector).Each(func(i int, contentSelection *goquery.Selection) {
117+
118+
aSelection := contentSelection.Find(resultselector)
119+
120+
if isAuto {
121+
title := aSelection.Text()
122+
123+
matched, err := regexp.MatchString(pattern, title)
124+
if err != nil {
125+
logger.Errorln(err)
126+
return
127+
}
128+
129+
if !matched {
130+
return
131+
}
132+
}
133+
134+
articleUrl, ok := aSelection.Attr("href")
135+
if ok {
136+
service.ParseArticle(articleUrl, isAuto)
137+
}
138+
})
139+
140+
return
112141
}

0 commit comments

Comments
 (0)