@@ -10,6 +10,7 @@ import (
 	"fmt"
 	"log"
 	"regexp"
+	"strconv"
 	"strings"
 
 	"config"
@@ -22,7 +23,7 @@ import (
 
 var websites = make(map[string]map[string]string)
 
-const pattern = "go|golang|goroutine|channel/i"
+const pattern = "(?i)go|golang|goroutine|channel"
 
 func autocrawl(needAll bool, crawlConfFile string, whichSite string) {
 
@@ -64,7 +65,6 @@ func doCrawl(wbconf map[string]string, isAll bool) {
 		crawlUrl = wbconf["all_url"]
 	}
 
-	keywords := strings.Split(wbconf["keywords"], ",")
 	listselector := wbconf["listselector"]
 	resultselector := wbconf["resultselector"]
 	pageField := wbconf["page_field"]
@@ -74,39 +74,68 @@ func doCrawl(wbconf map[string]string, isAll bool) {
 		maxPage = util.MustInt(wbconf["max_page"])
 	}
 
-	var (
-		doc *goquery.Document
-		err error
-	)
+	// Personal blogs are usually fetched by tag; this is handled differently from keyword search.
+	if wbconf["keywords"] == "" {
+		for p := maxPage; p >= 1; p-- {
+			if pageField == "" {
+
+				// Store the article even if its title doesn't contain keywords such as "go".
+				if err := parseArticleList(crawlUrl+strconv.Itoa(p), listselector, resultselector, false); err != nil {
+					break
+				}
+			}
+		}
+
+		return
+	}
+
+	keywords := strings.Split(wbconf["keywords"], ",")
 
 	for _, keyword := range keywords {
 		for p := 1; p <= maxPage; p++ {
 
 			page := fmt.Sprintf("&%s=%d", pageField, p)
-			logger.Infoln("parse url:", crawlUrl+keyword+page)
-			if doc, err = goquery.NewDocument(crawlUrl + keyword + page); err != nil {
+			if err := parseArticleList(crawlUrl+keyword+page, listselector, resultselector, true); err != nil {
+				logger.Errorln("parse article url error:", err)
 				break
 			}
+		}
+	}
+}
 
-			doc.Find(listselector).Each(func(i int, contentSelection *goquery.Selection) {
+func parseArticleList(url, listselector, resultselector string, isAuto bool) (err error) {
 
-				aSelection := contentSelection.Find(resultselector)
-				title := aSelection.Text()
-				matched, err := regexp.MatchString(pattern, title)
-				if err != nil {
-					logger.Errorln(err)
-					return
-				}
+	logger.Infoln("parse url:", url)
 
-				if !matched {
-					return
-				}
+	var doc *goquery.Document
 
-				articleUrl, ok := aSelection.Attr("href")
-				if ok {
-					service.ParseArticle(articleUrl, true)
-				}
-			})
-		}
+	if doc, err = goquery.NewDocument(url); err != nil {
+		return
 	}
+
+	doc.Find(listselector).Each(func(i int, contentSelection *goquery.Selection) {
+
+		aSelection := contentSelection.Find(resultselector)
+
+		if isAuto {
+			title := aSelection.Text()
+
+			matched, err := regexp.MatchString(pattern, title)
+			if err != nil {
+				logger.Errorln(err)
+				return
+			}
+
+			if !matched {
+				return
+			}
+		}
+
+		articleUrl, ok := aSelection.Attr("href")
+		if ok {
+			service.ParseArticle(articleUrl, isAuto)
+		}
+	})
+
+	return
 }
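
For reference, a minimal standalone sketch (my own illustration, not part of this commit) of the case-insensitive matching the new pattern constant relies on: Go's regexp package uses RE2 syntax, where the inline (?i) flag makes the whole pattern case-insensitive, whereas the old JavaScript-style trailing /i would have been matched as literal characters. The sample titles are made up.

package main

import (
	"fmt"
	"regexp"
)

// Same keyword pattern as in the commit; (?i) enables case-insensitive matching.
const pattern = "(?i)go|golang|goroutine|channel"

func main() {
	// Hypothetical article titles, only for illustration.
	for _, title := range []string{"Understanding Goroutines", "A Rust retrospective"} {
		matched, err := regexp.MatchString(pattern, title)
		if err != nil {
			fmt.Println("bad pattern:", err)
			return
		}
		fmt.Println(title, "=>", matched) // true for the first title, false for the second
	}
}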