Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 1225c3a

Browse files
committed
自动抓取更通用
1 parent 71c5ea4 commit 1225c3a

File tree

4 files changed

+19
-12
lines changed

4 files changed

+19
-12
lines changed

config/env.sample.ini

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,9 +54,10 @@ https_domain = xxx
5454

5555
; 抓取程序
5656
[crawl]
57-
listen = 5050
5857
spec = 0 0 */1 * * ?
5958
iframe_deny = github.com,robbinfan.com,www.zhihu.com,google.com,golang.org,evernote.com,blogspot.com
59+
reddit_path = /r/golang/new/
60+
project_url = http://www.oschina.net/project/lang/358/go?tag=0&os=0&sort=time
6061

6162
[search]
6263
engine_url = http://127.0.0.1:7070/solr/studygolang

src/logic/auto_crawl.go

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,11 +18,12 @@ import (
1818
"strings"
1919

2020
"github.com/PuerkitoBio/goquery"
21+
"github.com/polaris1119/config"
2122
"github.com/polaris1119/logger"
2223
"golang.org/x/net/context"
2324
)
2425

25-
const titlePattern = "(?i)go|golang|goroutine|channel"
26+
var titlePattern = config.ConfigFile.MustValue("crawl", "article_title_pattern")
2627

2728
type AutoCrawlLogic struct{}
2829

@@ -32,7 +33,7 @@ func (self AutoCrawlLogic) DoCrawl(isAll bool) error {
3233
autoCrawlConfList := make([]*model.AutoCrawlRule, 0)
3334
err := MasterDB.Where("status=?", model.AutoCrawlOn).Find(&autoCrawlConfList)
3435
if err != nil {
35-
logger.Errorln("ArticleLogic FindBy Error:", err)
36+
logger.Errorln("AutoCrawlLogic FindBy Error:", err)
3637
return err
3738
}
3839

@@ -163,7 +164,7 @@ func (self AutoCrawlLogic) parseArticleList(strUrl string, autoCrawlConf *model.
163164
aSelection := contentSelection.Find(resultSelector)
164165

165166
// 搜索时,避免搜到垃圾,对标题进一步判断
166-
if isSearch {
167+
if isSearch && titlePattern != "" {
167168
title := aSelection.Text()
168169

169170
matched, err := regexp.MatchString(titlePattern, title)

src/logic/reddit.go

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -20,24 +20,29 @@ import (
2020
"model"
2121

2222
"github.com/PuerkitoBio/goquery"
23+
"github.com/polaris1119/config"
2324
"github.com/polaris1119/logger"
2425
)
2526

2627
type RedditLogic struct {
2728
domain string
28-
golang string
29+
path string
2930
}
3031

31-
var DefaultReddit = RedditLogic{
32-
domain: "https://www.reddit.com",
33-
golang: "/r/golang/new/",
32+
var DefaultReddit = newRedditLogic()
33+
34+
func newRedditLogic() *RedditLogic {
35+
return &RedditLogic{
36+
domain: "https://www.reddit.com",
37+
path: config.ConfigFile.MustValue("crawl", "reddit_path"),
38+
}
3439
}
3540

3641
// Parse 获取url对应的资源并根据规则进行解析
3742
func (this *RedditLogic) Parse(redditUrl string) error {
3843
redditUrl = strings.TrimSpace(redditUrl)
3944
if redditUrl == "" {
40-
redditUrl = this.domain + this.golang
45+
redditUrl = this.domain + this.path
4146
} else if !strings.HasPrefix(redditUrl, "https") {
4247
redditUrl = "https://" + redditUrl
4348
}
@@ -126,7 +131,7 @@ func (this *RedditLogic) dealRedditOneResource(contentSelection *goquery.Selecti
126131
var doc *goquery.Document
127132

128133
if doc, err = goquery.NewDocument(resourceUrl); err != nil {
129-
return errors.New("goquery reddit.com/r/golang self newdocument error:" + err.Error())
134+
return errors.New("goquery reddit.com" + this.path + " self newdocument error:" + err.Error())
130135
}
131136

132137
content, err := doc.Find("#siteTable .usertext .md").Html()
@@ -152,7 +157,7 @@ func (this *RedditLogic) dealRedditOneResource(contentSelection *goquery.Selecti
152157
})
153158

154159
if strings.TrimSpace(content) == "" {
155-
return errors.New("goquery reddit.com/r/golang self newdocument(" + resourceUrl + ") error: content is empty")
160+
return errors.New("goquery reddit.com" + this.path + " self newdocument(" + resourceUrl + ") error: content is empty")
156161
}
157162

158163
resource.Content = content

src/server/crawler/main.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@ func autocrawl(needAll bool, whichSite string) {
5555
go logic.DefaultReddit.Parse("")
5656

5757
// 抓取 www.oschina.net/project
58-
go logic.DefaultProject.ParseProjectList("http://www.oschina.net/project/lang/358/go?tag=0&os=0&sort=time")
58+
go logic.DefaultProject.ParseProjectList(config.ConfigFile.MustValue("crawl", "project_url"))
5959

6060
// 抓取 article
6161
go logic.DefaultAutoCrawl.DoCrawl(false)

0 commit comments

Comments
 (0)