
Commit 8b806bd

抓取重构 (crawl refactoring)

1 parent 61a2e94 commit 8b806bd

File tree

4 files changed: 59 additions and 79 deletions


websites/code/studygolang/src/config/config.go

Lines changed: 15 additions & 3 deletions
```diff
@@ -8,11 +8,14 @@ package config
 
 import (
 	"encoding/json"
+	"errors"
 	"io/ioutil"
 	"path"
-	"process"
+	"reflect"
 	"strconv"
 	"strings"
+
+	"process"
 )
 
 // Project root directory
@@ -44,14 +47,23 @@ const Gt = ">"
 
 type Conf map[string]interface{}
 
-func ParseConfig(filename string) (Conf, error) {
+func ParseConfig(filename string, store interface{}) (Conf, error) {
 	content, err := ioutil.ReadFile(ROOT + filename)
 	if err != nil {
 		return nil, err
 	}
 
 	var conf Conf
-	err = json.Unmarshal(content, &conf)
+	if store == nil {
+		store = &conf
+	} else {
+		storeType := reflect.TypeOf(store)
+		if storeType.Kind() != reflect.Ptr {
+			return nil, errors.New("store must be pointer or nil")
+		}
+	}
+
+	err = json.Unmarshal(content, store)
	if err != nil {
		return nil, err
	}
```
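With the new `store` parameter, `ParseConfig` can decode a JSON file either into the generic `Conf` map (pass `nil`) or directly into a caller-supplied pointer, which is how the crawler loads its `websites` table below. A minimal sketch of both call styles, assuming it runs inside this project's GOPATH layout; the `config.json` filename is illustrative, not taken from the diff:

```go
// Sketch only: the "config" import resolves inside this project's GOPATH layout;
// "config.json" is a made-up filename for the nil-store (legacy) call style.
package main

import (
	"log"

	"config"
)

func main() {
	// Typed decode, mirroring autocrawl.go in this commit: pass a pointer and
	// ParseConfig unmarshals the file straight into it.
	websites := make(map[string]map[string]string)
	if _, err := config.ParseConfig("conf/auto_crawl_conf.json", &websites); err != nil {
		log.Fatalln("parse crawl config error:", err)
	}

	// Generic decode: passing nil keeps the old behaviour and decodes into the
	// package's Conf map instead.
	conf, err := config.ParseConfig("config.json", nil)
	if err != nil {
		log.Fatalln("parse config error:", err)
	}

	_ = conf
	_ = websites
}
```

Passing a non-nil value that is not a pointer fails the `reflect` check and returns the "store must be pointer or nil" error shown in the diff.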

websites/code/studygolang/src/server/crawlarticle/autocrawl.go

Lines changed: 22 additions & 66 deletions
```diff
@@ -8,6 +8,7 @@ package main
 
 import (
 	"fmt"
+	"log"
 	"regexp"
 	"strings"
 
@@ -19,87 +20,42 @@ import (
 	"util"
 )
 
-var websites = map[string]map[string]string{
-	"cnblogs": {
-		"all_url":        "http://zzk.cnblogs.com/s?t=b&w=",
-		"incr_url":       "http://zzk.cnblogs.com/s?t=b&dateRange=One-Week&w=",
-		"keywords":       "golang,go语言",
-		"listselector":   "#searchResult .searchItem", // selector for one search-result item
-		"resultselector": "h3 a",
-		"page_field":     "p",
-		"max_page":       "30",
-	},
-	"csdn": {
-		"all_url":        "http://so.csdn.net/so/search/s.do?t=blog&q=",
-		"incr_url":       "http://so.csdn.net/so/search/s.do?t=blog&q=",
-		"keywords":       "go,golang,golang语言,go语言",
-		"listselector":   ".search-list",
-		"resultselector": "dt a",
-		"page_field":     "p",
-		"max_page":       "13",
-	},
-	"oschina": {
-		"all_url":        "http://www.oschina.net/search?scope=blog&q=",
-		"incr_url":       "http://www.oschina.net/search?scope=blog&sort_by_time=1&q=",
-		"keywords":       "go,golang",
-		"listselector":   "#results li",
-		"resultselector": "h3 a",
-		"page_field":     "p",
-		"max_page":       "50",
-	},
-	"oschina_translate": {
-		"all_url":        "http://www.oschina.net/search?scope=translate&q=",
-		"incr_url":       "http://www.oschina.net/search?scope=translate&sort_by_time=1&q=",
-		"keywords":       "go,golang",
-		"listselector":   "#results li",
-		"resultselector": "h3 a",
-		"page_field":     "p",
-		"max_page":       "50",
-	},
-	"iteye": {
-		"all_url":        "http://www.iteye.com/search?type=blog&query=",
-		"incr_url":       "http://www.iteye.com/search?type=blog&sort=created_at&query=",
-		"keywords":       "go语言,golang",
-		"listselector":   "#search_result .topic",
-		"resultselector": ".content h4 a",
-		"page_field":     "page",
-		"max_page":       "20",
-	},
-	"iteye_news": {
-		"all_url":        "http://www.iteye.com/search?type=news&query=",
-		"incr_url":       "http://www.iteye.com/search?type=news&sort=created_at&query=",
-		"keywords":       "go语言,golang",
-		"listselector":   "#search_result .topic",
-		"resultselector": ".content h4 a",
-		"page_field":     "page",
-		"max_page":       "20",
-	},
-}
+var websites = make(map[string]map[string]string)
 
 const pattern = "go|golang|goroutine|channel/i"
 
-func autocrawl(needAll bool) {
+func autocrawl(needAll bool, crawlConfFile string, whichSite string) {
+
+	_, err := config.ParseConfig(crawlConfFile, &websites)
+	if err != nil {
+		log.Fatalln("parse crawl config error:", err)
+	}
 
 	if needAll {
 		// full crawl
 		for website, wbconf := range websites {
+			if whichSite != "" && whichSite != website {
+				continue
+			}
+
 			logger.Infoln("all crawl", website)
 			go doCrawl(wbconf, true)
 		}
 	}
 
 	// scheduled incremental crawl
 	c := cron.New()
-	c.AddFunc(config.Config["crawl_spec"], startCrawl)
-	c.Start()
-}
-
-func startCrawl() {
+	c.AddFunc(config.Config["crawl_spec"], func() {
+		for website, wbconf := range websites {
+			if whichSite != "" && whichSite != website {
+				continue
+			}
 
-	for website, wbconf := range websites {
-		logger.Infoln("do crawl", website)
-		go doCrawl(wbconf, false)
-	}
+			logger.Infoln("do crawl", website)
+			go doCrawl(wbconf, false)
+		}
+	})
+	c.Start()
 }
 
 func doCrawl(wbconf map[string]string, isAll bool) {
```
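The site table removed above now has to come from the file handed to `-config`. Its exact contents are not part of this commit, but since `ParseConfig` unmarshals it into a `map[string]map[string]string`, each entry presumably carries the same string fields as the old literal. A self-contained sketch of that assumed shape, reusing the `cnblogs` values from the deleted map:

```go
// Sketch only: sampleConf is a guess at one entry of conf/auto_crawl_conf.json,
// built from the field names and values of the map literal removed in this commit.
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

const sampleConf = `{
	"cnblogs": {
		"all_url": "http://zzk.cnblogs.com/s?t=b&w=",
		"incr_url": "http://zzk.cnblogs.com/s?t=b&dateRange=One-Week&w=",
		"keywords": "golang,go语言",
		"listselector": "#searchResult .searchItem",
		"resultselector": "h3 a",
		"page_field": "p",
		"max_page": "30"
	}
}`

func main() {
	// Same shape ParseConfig fills when autocrawl passes &websites.
	websites := make(map[string]map[string]string)
	if err := json.Unmarshal([]byte(sampleConf), &websites); err != nil {
		log.Fatalln("unmarshal error:", err)
	}
	fmt.Println(websites["cnblogs"]["max_page"]) // prints: 30
}
```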

websites/code/studygolang/src/server/crawlarticle/main.go

Lines changed: 8 additions & 2 deletions
```diff
@@ -28,11 +28,17 @@ func init() {
 }
 
 func main() {
-	var needAll bool
+	var (
+		needAll          bool
+		crawConfFilename string
+		whichSite        string
+	)
 	flag.BoolVar(&needAll, "all", false, "whether to do a full crawl (default: no)")
+	flag.StringVar(&crawConfFilename, "config", "conf/auto_crawl_conf.json", "auto-crawl config file")
+	flag.StringVar(&whichSite, "site", "", "which site in the crawl config to crawl (empty means all configured sites)")
 	flag.Parse()
 
-	go autocrawl(needAll)
+	go autocrawl(needAll, crawConfFilename, whichSite)
 
 	router := initRouter()
 	http.Handle("/", router)
```
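With these flags in place, the crawler could presumably be started along these lines (the binary name and site key are illustrative, not confirmed by the diff): `./crawlarticle -all=true` for a one-off full crawl of every configured site, or `./crawlarticle -config=conf/auto_crawl_conf.json -site=cnblogs` to limit both the full crawl and the scheduled incremental crawl to a single site.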

websites/code/studygolang/src/service/article.go

Lines changed: 14 additions & 8 deletions
```diff
@@ -76,15 +76,21 @@ func ParseArticle(articleUrl string, auto bool) (*model.Article, error) {
 		author = urlPaths[index]
 		authorTxt = author
 	} else {
-		authorSelection := doc.Find(rule.Author)
-		author, err = authorSelection.Html()
-		if err != nil {
-			logger.Errorln("goquery parse author error:", err)
-			return nil, err
+		if strings.HasPrefix(rule.Author, ".") || strings.HasPrefix(rule.Author, "#") {
+			authorSelection := doc.Find(rule.Author)
+			author, err = authorSelection.Html()
+			if err != nil {
+				logger.Errorln("goquery parse author error:", err)
+				return nil, err
+			}
+
+			author = strings.TrimSpace(author)
+			authorTxt = strings.TrimSpace(authorSelection.Text())
+		} else {
+			// Some personal blogs expose no author info on the page, so the rule's author field is the author name itself
+			author = rule.Author
+			authorTxt = rule.Author
 		}
-
-		author = strings.TrimSpace(author)
-		authorTxt = strings.TrimSpace(authorSelection.Text())
 	}
 
 	title := ""
```
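The net effect is that `rule.Author` may now be either a CSS selector (anything starting with `.` or `#`, resolved with goquery as before) or a literal author name used verbatim for blogs whose pages carry no author markup. A standalone sketch of that branching, assuming the `github.com/PuerkitoBio/goquery` package the surrounding code appears to use:

```go
// Sketch only: demonstrates the selector-vs-literal author rule on a tiny HTML snippet.
package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func authorFromRule(doc *goquery.Document, ruleAuthor string) string {
	// Selector-style rules ("." or "#" prefix) are looked up in the page.
	if strings.HasPrefix(ruleAuthor, ".") || strings.HasPrefix(ruleAuthor, "#") {
		return strings.TrimSpace(doc.Find(ruleAuthor).Text())
	}
	// Otherwise the rule already names the author, e.g. for a personal blog.
	return ruleAuthor
}

func main() {
	html := `<html><body><span class="author"> 张三 </span></body></html>`
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		log.Fatalln(err)
	}
	fmt.Println(authorFromRule(doc, ".author"))     // selector rule: prints 张三
	fmt.Println(authorFromRule(doc, "Some Author")) // literal rule: prints Some Author
}
```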
