Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 2246126

Browse files
author
xuxinhua
committed
定期抓取 reddit 上的go资源
1 parent e2b3139 commit 2246126

File tree

6 files changed

+197
-3
lines changed

6 files changed

+197
-3
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// Copyright 2014 The StudyGolang Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
// http://studygolang.com http://golang.top
5+
// Author:polaris [email protected]
6+
7+
package api
8+
9+
import (
10+
"fmt"
11+
"net/http"
12+
"service"
13+
)
14+
15+
func AddRedditResourceHandler(rw http.ResponseWriter, req *http.Request) {
16+
err := service.ParseReddit(req.FormValue("url"))
17+
if err != nil {
18+
fmt.Fprint(rw, err)
19+
return
20+
}
21+
22+
fmt.Fprint(rw, "success")
23+
}

websites/code/studygolang/src/server/crawlarticle/autocrawl.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ func autocrawl(needAll bool, crawlConfFile string, whichSite string) {
4747
// 定时增量
4848
c := cron.New()
4949
c.AddFunc(config.Config["crawl_spec"], func() {
50+
// 抓取 reddit
51+
go service.ParseReddit("")
52+
5053
for website, wbconf := range websites {
5154
if whichSite != "" && whichSite != website {
5255
continue

websites/code/studygolang/src/server/crawlarticle/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,5 +61,6 @@ func initRouter() *mux.Router {
6161
router := mux.NewRouter()
6262

6363
router.HandleFunc("/", api.AddArticleHandler)
64+
router.HandleFunc("/reddit", api.AddRedditResourceHandler)
6465
return router
6566
}
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
// Copyright 2013 The StudyGolang Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
// http://studygolang.com, http://golang.top
5+
// Author:polaris [email protected]
6+
7+
// 解析 http://www.reddit.com/r/golang 最新 Go 信息
8+
package service
9+
10+
import (
11+
"errors"
12+
"math/rand"
13+
"regexp"
14+
"strings"
15+
"time"
16+
17+
"github.com/PuerkitoBio/goquery"
18+
"logger"
19+
"model"
20+
"util"
21+
)
22+
23+
// Addresses used when crawling Reddit.
const (
	// Reddit is the site root. It is also prepended to the site-relative
	// links of self posts to make them absolute.
	Reddit = "http://www.reddit.com"

	// RedditGolang is the path of the Go subreddit; Reddit + RedditGolang is
	// the listing crawled when no explicit URL is given.
	RedditGolang = "/r/golang"
)
27+
28+
// 获取url对应的文章并根据规则进行解析
29+
func ParseReddit(redditUrl string) error {
30+
redditUrl = strings.TrimSpace(redditUrl)
31+
if redditUrl == "" {
32+
redditUrl = Reddit + RedditGolang
33+
} else if !strings.HasPrefix(redditUrl, "http") {
34+
redditUrl = "http://" + redditUrl
35+
}
36+
37+
var (
38+
doc *goquery.Document
39+
err error
40+
)
41+
42+
if doc, err = goquery.NewDocument(redditUrl); err != nil {
43+
logger.Errorln("goquery reddit newdocument error:", err)
44+
return err
45+
}
46+
47+
doc.Find("#siteTable .link").Each(func(i int, contentSelection *goquery.Selection) {
48+
49+
err = dealRedditOneResource(contentSelection)
50+
51+
if err != nil {
52+
logger.Errorln(err)
53+
}
54+
})
55+
56+
return err
57+
}
58+
59+
// PresetUids lists the user IDs among which dealRedditOneResource picks one
// at random to record as the owner (Uid) of each imported resource.
var PresetUids = []int{1, 1747, 1748, 1827}
60+
61+
// resourceRe matches two consecutive newlines; dealRedditOneResource uses it
// to replace each such pair in scraped comment HTML with a single newline.
var resourceRe = regexp.MustCompile(`\n\n`)
62+
63+
// 处理 Reddit 中的一条资源
64+
func dealRedditOneResource(contentSelection *goquery.Selection) error {
65+
aSelection := contentSelection.Find(".title a.title")
66+
67+
title := aSelection.Text()
68+
if title == "" {
69+
return errors.New("title is empty")
70+
}
71+
72+
resourceUrl, ok := aSelection.Attr("href")
73+
if !ok || resourceUrl == "" {
74+
return errors.New("resource url is empty")
75+
}
76+
77+
resource := model.NewResource()
78+
// Reddit 自身的内容
79+
if contentSelection.HasClass("self") {
80+
resourceUrl = Reddit + resourceUrl
81+
}
82+
83+
err := resource.Where("url=?", resourceUrl).Find("id")
84+
// 已经存在
85+
if resource.Id != 0 {
86+
return errors.New("url" + resourceUrl + "has exists!")
87+
}
88+
89+
if contentSelection.HasClass("self") {
90+
91+
resource.Form = model.ContentForm
92+
93+
var doc *goquery.Document
94+
95+
if doc, err = goquery.NewDocument(resourceUrl); err != nil {
96+
return errors.New("goquery reddit.com/r/golang self newdocument error:" + err.Error())
97+
}
98+
99+
content, err := doc.Find("#siteTable .usertext .md").Html()
100+
if err != nil {
101+
return err
102+
}
103+
104+
doc.Find(".commentarea .comment .usertext .md").Each(func(i int, contentSel *goquery.Selection) {
105+
if i == 0 {
106+
content += `<hr/>**评论:**<br/><br/>`
107+
}
108+
109+
comment, err := contentSel.Html()
110+
if err != nil {
111+
return
112+
}
113+
114+
comment = strings.TrimSpace(comment)
115+
comment = resourceRe.ReplaceAllLiteralString(comment, "\n")
116+
117+
author := contentSel.ParentsFiltered(".usertext").Prev().Find(".author").Text()
118+
content += author + ": <pre>" + comment + "</pre>"
119+
})
120+
121+
resource.Content = content
122+
123+
// reddit 本身的,当做其他资源
124+
resource.Catid = 4
125+
} else {
126+
resource.Form = model.LinkForm
127+
128+
// Github,是开源项目
129+
if contentSelection.Find(".title .domain a").Text() == "github.com" {
130+
resource.Catid = 2
131+
} else {
132+
resource.Catid = 1
133+
}
134+
}
135+
136+
resource.Title = title
137+
resource.Url = resourceUrl
138+
resource.Uid = PresetUids[rand.Intn(4)]
139+
140+
ctime := util.TimeNow()
141+
datetime, ok := contentSelection.Find(".tagline time").Attr("datetime")
142+
if ok {
143+
dtime, err := time.Parse(time.RFC3339, datetime)
144+
if err != nil {
145+
logger.Errorln("parse ctime error:", err)
146+
} else {
147+
ctime = dtime.Format("2006-01-02 15:04:05")
148+
}
149+
}
150+
resource.Ctime = ctime
151+
152+
var id int64
153+
id, err = resource.Insert()
154+
155+
if err != nil {
156+
return errors.New("insert into Resource error:" + err.Error())
157+
}
158+
159+
// 存扩展信息
160+
resourceEx := model.NewResourceEx()
161+
resourceEx.Id = int(id)
162+
if _, err = resourceEx.Insert(); err != nil {
163+
return errors.New("insert into ResourceEx error:" + err.Error())
164+
}
165+
166+
return nil
167+
}

websites/code/studygolang/template/admin/article/modify.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ <h3>基本信息</h3>
132132
<h3>操作</h3>
133133
</div>
134134
<div>
135-
<p>
135+
<p>
136136
<label>&nbsp;</label>
137137
<input type="submit" class="submit radius2" value="提交" />
138138
</p>
@@ -158,4 +158,4 @@ <h3>操作</h3>
158158
<script type="text/javascript" src="/static/js/libs/jquery.Huploadify.js"></script>
159159
<script type="text/javascript" src="/static/js/admin/forms.js"></script>
160160
<script type="text/javascript" src="/static/js/upload.js"></script>
161-
{{end}}
161+
{{end}}

websites/code/studygolang/template/index.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ <h3><a href="/resources">Golang 资源</a></h3>
3030
</div>
3131
<ul class="clearfix list-unstyled">
3232
{{range .resources}}
33-
<li><i></i><a href="/resources/{{.Id}}" title="{{.Title}}" target="_blank">{{substring .Title 30 "..."}}</a><span class="pull-right timeago" title="{{.Ctime}}"></span></li>
33+
<li><i></i><a href="/resources/{{.Id}}" title="{{.Title}}" target="_blank">{{substring .Title 45 "..."}}</a><span class="pull-right timeago" title="{{.Ctime}}"></span></li>
3434
{{end}}
3535
</ul>
3636
</div>

0 commit comments

Comments
 (0)