Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 871c39c

Browse files
committed
开源项目抓取
1 parent 882de2a commit 871c39c

File tree

10 files changed

+217
-10
lines changed

10 files changed

+217
-10
lines changed

websites/code/studygolang/src/config/config.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"encoding/json"
1111
"errors"
1212
"io/ioutil"
13+
"os"
1314
"path"
1415
"reflect"
1516
"strconv"
@@ -30,6 +31,11 @@ func init() {
3031
}
3132
ROOT = path.Dir(binDir) + "/"
3233

34+
if !strings.Contains(ROOT, "studygolang") {
35+
ROOT, _ = os.Getwd()
36+
ROOT = ROOT[:strings.Index(ROOT, "src")]
37+
}
38+
3339
// Load配置文件
3440
configFile := ROOT + "conf/config.json"
3541
content, err := ioutil.ReadFile(configFile)

websites/code/studygolang/src/model/openproject.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ import (
1111
"util"
1212
)
1313

14+
// ProjectStatusOnline is the status code (1) that marks an open project as online.
const ProjectStatusOnline = 1
17+
1418
// 开源项目信息
1519
type OpenProject struct {
1620
Id int `json:"id" pk:"1"`

websites/code/studygolang/src/server/crawlarticle/autocrawl.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,12 @@ import (
1414
"strings"
1515

1616
"config"
17-
"github.com/PuerkitoBio/goquery"
18-
"github.com/robfig/cron"
1917
"logger"
2018
"service"
2119
"util"
20+
21+
"github.com/PuerkitoBio/goquery"
22+
"github.com/robfig/cron"
2223
)
2324

2425
// websites maps a website name to its string key/value configuration.
var websites = map[string]map[string]string{}
@@ -50,6 +51,9 @@ func autocrawl(needAll bool, crawlConfFile string, whichSite string) {
5051
// 抓取 reddit
5152
go service.ParseReddit("")
5253

54+
// 抓取 www.oschina.net/project
55+
go service.ParseProjectList("http://www.oschina.net/project/lang/358/go?tag=0&os=0&sort=time")
56+
5357
for website, wbconf := range websites {
5458
if whichSite != "" && whichSite != website {
5559
continue

websites/code/studygolang/src/service/project.go

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,18 @@ package service
88

99
import (
1010
"errors"
11+
"math/rand"
1112
"net/url"
1213
"strconv"
1314
"strings"
15+
"time"
1416

1517
"logger"
1618
"model"
1719
"util"
20+
21+
"github.com/PuerkitoBio/goquery"
22+
"github.com/lunny/html2md"
1823
)
1924

2025
func PublishProject(user map[string]interface{}, form url.Values) (err error) {
@@ -211,6 +216,156 @@ func ProjectsTotal() (total int) {
211216
return
212217
}
213218

219+
// ParseProjectList 解析其他网站的开源项目
220+
func ParseProjectList(pUrl string) error {
221+
pUrl = strings.TrimSpace(pUrl)
222+
if !strings.HasPrefix(pUrl, "http") {
223+
pUrl = "http://" + pUrl
224+
}
225+
226+
var (
227+
doc *goquery.Document
228+
err error
229+
)
230+
231+
if doc, err = goquery.NewDocument(pUrl); err != nil {
232+
logger.Errorln("goquery opensource project newdocument error:", err)
233+
return err
234+
}
235+
236+
// 最后面的先入库处理
237+
projectsSelection := doc.Find(".ProjectList .List li")
238+
239+
for i := projectsSelection.Length() - 1; i >= 0; i-- {
240+
241+
contentSelection := goquery.NewDocumentFromNode(projectsSelection.Get(i)).Selection
242+
projectUrl, ok := contentSelection.Find("h3 a").Attr("href")
243+
244+
if !ok || projectUrl == "" {
245+
continue
246+
}
247+
err = ParseOneProject(projectUrl)
248+
249+
if err != nil {
250+
logger.Errorln(err)
251+
}
252+
}
253+
254+
return err
255+
}
256+
257+
// OsChinaDomain is the scheme and host prepended to relative oschina.net links.
const OsChinaDomain = "http://www.oschina.net"

// PresetUsernames lists the user names a crawled project can be attributed to.
var PresetUsernames = []string{
	"polaris",
	"blov",
	"agolangf",
	"xuanbao",
}
260+
261+
// ParseOneProject 处理单个 project
262+
func ParseOneProject(projectUrl string) error {
263+
if !strings.HasPrefix(projectUrl, "http") {
264+
projectUrl = OsChinaDomain + projectUrl
265+
}
266+
267+
var (
268+
doc *goquery.Document
269+
err error
270+
)
271+
272+
if doc, err = goquery.NewDocument(projectUrl); err != nil {
273+
return errors.New("goquery fetch " + projectUrl + " error:" + err.Error())
274+
}
275+
276+
// 标题
277+
category := strings.TrimSpace(doc.Find(".Project .name").Text())
278+
name := strings.TrimSpace(doc.Find(".Project .name u").Text())
279+
tmpIndex := strings.LastIndex(category, name)
280+
if tmpIndex != -1 {
281+
category = category[:tmpIndex]
282+
}
283+
284+
// uri
285+
uri := projectUrl[strings.LastIndex(projectUrl, "/")+1:]
286+
287+
project := model.NewOpenProject()
288+
289+
err = project.Where("uri=?", uri).Find("id")
290+
// 已经存在
291+
if project.Id != 0 {
292+
return errors.New("url" + projectUrl + "has exists!")
293+
}
294+
295+
// 获取项目相关链接
296+
doc.Find("#Body .urls li").Each(func(i int, liSelection *goquery.Selection) {
297+
aSelection := liSelection.Find("a")
298+
uri := util.FetchRealUrl(OsChinaDomain + aSelection.AttrOr("href", ""))
299+
switch aSelection.Text() {
300+
case "软件首页":
301+
project.Home = uri
302+
case "软件文档":
303+
project.Doc = uri
304+
case "软件下载":
305+
project.Download = uri
306+
}
307+
})
308+
309+
ctime := util.TimeNow()
310+
311+
doc.Find("#Body .attrs li").Each(func(i int, liSelection *goquery.Selection) {
312+
aSelection := liSelection.Find("a")
313+
txt := aSelection.Text()
314+
if i == 0 {
315+
project.Licence = txt
316+
if txt == "未知" {
317+
project.Licence = "其他"
318+
}
319+
} else if i == 1 {
320+
project.Lang = txt
321+
} else if i == 2 {
322+
project.Os = txt
323+
} else if i == 3 {
324+
dtime, err := time.ParseInLocation("2006年01月02日", aSelection.Last().Text(), time.Local)
325+
if err != nil {
326+
logger.Errorln("parse ctime error:", err)
327+
} else {
328+
ctime = dtime.Local().Format("2006-01-02 15:04:05")
329+
}
330+
}
331+
})
332+
333+
project.Name = name
334+
project.Category = category
335+
project.Uri = uri
336+
project.Repo = strings.TrimSpace(doc.Find("#Body .github-widget").AttrOr("data-repo", ""))
337+
project.Src = "https://github.com/" + project.Repo
338+
project.Author = project.Repo[:strings.Index(project.Repo, "/")]
339+
340+
if project.Doc == "" {
341+
// TODO:暂时认为一定是 Go 语言
342+
project.Doc = "https://godoc.org/" + project.Src
343+
}
344+
345+
desc := ""
346+
doc.Find("#Body .detail").Find("p").NextAll().Each(func(i int, domSelection *goquery.Selection) {
347+
doc.FindSelection(domSelection).WrapHtml(`<div id="tmp` + strconv.Itoa(i) + `"></div>`)
348+
domHtml, _ := doc.Find("#tmp" + strconv.Itoa(i)).Html()
349+
if domSelection.Is("pre") {
350+
desc += domHtml + "\n\n"
351+
} else {
352+
desc += html2md.Convert(domHtml) + "\n\n"
353+
}
354+
})
355+
356+
project.Desc = strings.TrimSpace(desc)
357+
project.Username = PresetUsernames[rand.Intn(4)]
358+
project.Status = model.ProjectStatusOnline
359+
project.Ctime = ctime
360+
361+
_, err = project.Insert()
362+
if err != nil {
363+
return errors.New("insert into open project error:" + err.Error())
364+
}
365+
366+
return nil
367+
}
368+
214369
// 通过objid获得 project 的所有者
215370
func getProjectOwner(id int) int {
216371
project := model.NewOpenProject()
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package service_test
2+
3+
import (
4+
"testing"
5+
6+
"service"
7+
)
8+
9+
func TestParseProjectList(t *testing.T) {
10+
service.ParseProjectList("http://www.oschina.net/project/lang/358/go?tag=0&os=0&sort=view")
11+
}
12+
13+
func TestParseOneProject(t *testing.T) {
14+
service.ParseOneProject("http://www.oschina.net/p/docker")
15+
}

websites/code/studygolang/src/service/reddit.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,19 @@ import (
1515
"strings"
1616
"time"
1717

18-
"github.com/PuerkitoBio/goquery"
1918
"logger"
2019
"model"
2120
"util"
21+
22+
"github.com/PuerkitoBio/goquery"
2223
)
2324

2425
// Reddit is the base URL of reddit.
const Reddit = "http://www.reddit.com"

// RedditGolang is the path of the newest posts in the golang subreddit.
const RedditGolang = "/r/golang/new/"
2829

29-
// 获取url对应的文章并根据规则进行解析
30+
// 获取url对应的资源并根据规则进行解析
3031
func ParseReddit(redditUrl string) error {
3132
redditUrl = strings.TrimSpace(redditUrl)
3233
if redditUrl == "" {

websites/code/studygolang/src/service/user_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ import (
1212
)
1313

1414
func TestFindUsersByPage(t *testing.T) {
15-
userList, total := FindUsersByPage()
16-
if total == nil && len(userList) == 0 {
17-
t.Fatal(err)
15+
userList, total := FindUsersByPage(make(map[string]string), 0, 10)
16+
if total == 0 && len(userList) == 0 {
17+
t.Fatal("")
1818
}
1919
t.Log(len(userList))
2020
for k, tmpUser := range userList {

websites/code/studygolang/src/util/http.go

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,35 @@
77
package util
88

99
import (
10-
"github.com/gorilla/context"
10+
"errors"
1111
"net/http"
12+
13+
"github.com/gorilla/context"
1214
)
1315

16+
// Redirect 重定向到指定的 uri
1417
func Redirect(rw http.ResponseWriter, req *http.Request, uri string) {
1518
// 避免跳转,context中没有清除
1619
context.Clear(req)
1720

1821
http.Redirect(rw, req, uri, http.StatusFound)
1922
}
23+
24+
// FetchRealUrl resolves the real URL behind a link by following at most one
// redirect.
//
// It issues a GET for uri. If the server answers with a redirect, the URL of
// the first redirect target is returned without being requested. If the
// request completes without a redirect, uri itself is returned. If the
// request fails before any redirect is seen, the result is the empty string.
//
// NOTE(review): the client is created without a timeout.
func FetchRealUrl(uri string) (realUrl string) {
	// Record the first redirect target, then abort the request by returning
	// an error from the redirect policy.
	stopAtFirstHop := func(next *http.Request, _ []*http.Request) error {
		realUrl = next.URL.String()
		return errors.New("util fetch real url")
	}
	httpClient := &http.Client{CheckRedirect: stopAtFirstHop}

	response, getErr := httpClient.Get(uri)
	if getErr == nil {
		defer response.Body.Close()
		return uri
	}

	// Either a redirect was intercepted (realUrl is set) or the request
	// failed outright (realUrl is still empty).
	return realUrl
}

websites/code/studygolang/template/common/layout.html

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@
120120
</p>
121121
<p class="text-center">
122122
<a href="http://www.ucai.cn?fr=studygolang" title="优才网" target="_blank"><img src="http://p3.ucai.cn/static/i3/mission/moblie_ucai_logo.png" alt="优才网" width="175px" height="45px" /></a>
123-
<a href="https://www.shiyanlou.com?fr=studygolang" title="实验楼" target="_blank"><img src="http://studygolang.qiniudn.com/ad/shiyanlou_logo.png" alt="实验楼" width="310px" height="45px" /></a>
124123
<a href="https://portal.qiniu.com/signup?code=3lfz4at7pxfma" title="七牛云存储" target="_blank"><img src="http://qiniutek.com/images/logo-2.png" alt="七牛云存储" width="290px" height="45px" /></a>
125124
</p>
126125
</div>
@@ -192,7 +191,7 @@ <h5>第三方账号登录</h5>
192191
<!--<script type="text/javascript" src="http://apps.bdimg.com/libs/jquery/1.11.1/jquery.min.js"></script>-->
193192
<script type="text/javascript" src="/static/js/libs/jquery-1.11.1.min.js"></script>
194193
<!--<script src="http://apps.bdimg.com/libs/bootstrap/3.2.0/js/bootstrap.min.js"></script>-->
195-
<script type="text/javascript" src="/static/js/libs//js/bootstrap-3.2.0.min.js"></script>
194+
<script type="text/javascript" src="/static/js/libs/bootstrap-3.2.0.min.js"></script>
196195
<script src="/static/js/libs/jquery.timeago.js"></script>
197196
<script src="/static/js/libs/jquery.timeago.zh-CN.js"></script>
198197
<script src="/static/js/libs/md5.js"></script>

websites/code/thirdparty/getpkg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ go get -u -v github.com/gorilla/sessions
1616
go get -u -v github.com/robfig/cron
1717
go get -u -v github.com/qiniu/api.v6
1818
go get -u -v github.com/dchest/captcha
19+
go get -u -v github.com/lunny/html2md
1920

2021
cp /etc/hosts ~/hosts
2122
#echo "golang.org 101.251.196.90" > /etc/hosts

0 commit comments

Comments
 (0)