Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ea58d15

Browse files
committed
抓取迁移
1 parent d74ef91 commit ea58d15

File tree

13 files changed

+615
-7
lines changed

13 files changed

+615
-7
lines changed
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
env.ini
22
env.dev.ini
3-
env.pro.ini
3+
env.pro.ini
4+
auto_crawl.json

websites/code2/studygolang/install.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ BUILD="`git symbolic-ref HEAD | cut -b 12-`-`git rev-parse HEAD`"
1919

2020
go install -ldflags "-X global.Build="$BUILD server/studygolang
2121
go install server/indexer
22+
go install server/crawler
2223

2324
export GOPATH="$OLDGOPATH"
2425
export PATH="$OLDPATH"

websites/code2/studygolang/src/global/app.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,17 +34,23 @@ import (
3434

3535
var Build string
3636

37-
var App = struct {
37+
// app describes the running binary. A single shared value of it is exposed
// as App.
type app struct {
3838
// Name is the program name; init sets it from os.Args[0].
Name string
3939
// Build is the build identifier; init copies it from the package-level Build variable.
Build string
4040
// Version is the release version string.
Version string
4141
// Date is a timestamp for the binary.
// NOTE(review): presumably derived from the executable's file info — confirm in init.
Date time.Time
4242

43+
// LaunchTime is the time at which the process was launched.
44+
LaunchTime time.Time
45+
// Uptime is the time elapsed since LaunchTime; it is refreshed by SetUptime.
Uptime time.Duration
46+
4347
// Env is the runtime environment, read from the "env" key of the "global" config section.
Env string
4448

4549
// Host and Port are the address parts of the server.
Host string
4650
Port string
47-
}{}
51+
}
52+
53+
// App is the shared application descriptor.
var App = app{}
4854

4955
var showVersion = flag.Bool("version", false, "Print version of this binary")
5056

@@ -58,6 +64,7 @@ func init() {
5864
App.Name = os.Args[0]
5965
App.Version = "V2.0.0"
6066
App.Build = Build
67+
App.LaunchTime = time.Now()
6168

6269
fileInfo, err := os.Stat(os.Args[0])
6370
if err != nil {
@@ -69,6 +76,10 @@ func init() {
6976
App.Env = config.ConfigFile.MustValue("global", "env")
7077
}
7178

79+
func (this *app) SetUptime() {
80+
this.Uptime = time.Now().Sub(this.LaunchTime)
81+
}
82+
7283
func PrintVersion(w io.Writer) {
7384
if !flag.Parsed() {
7485
flag.Parse()

websites/code2/studygolang/src/http/controller/install.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,9 @@ func (InstallController) genConfig(ctx echo.Context) error {
249249
config.ConfigFile.SetKeyComments("sensitive", "content", "内容关键词")
250250
config.ConfigFile.SetValue("sensitive", "content", "")
251251

252+
config.ConfigFile.SetSectionComments("search", "搜索配置")
253+
config.ConfigFile.SetValue("search", "engine_url", "")
254+
252255
// 校验数据库配置是否正确有效
253256
if err := db.TestDB(); err != nil {
254257
return err

websites/code2/studygolang/src/http/http.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ func executeTpl(ctx echo.Context, tpl *template.Template, data map[string]interf
206206
} else {
207207
data["wshost"] = global.App.Host + ":" + global.App.Port
208208
}
209+
global.App.SetUptime()
209210
data["app"] = global.App
210211

211212
buf := new(bytes.Buffer)

websites/code2/studygolang/src/logic/project.go

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,18 @@ package logic
88

99
import (
1010
"errors"
11+
"math/rand"
1112
"model"
1213
"net/url"
1314
"strconv"
1415
"strings"
1516
"time"
17+
"util"
1618

1719
. "db"
1820

21+
"github.com/PuerkitoBio/goquery"
22+
"github.com/lunny/html2md"
1923
"github.com/polaris1119/logger"
2024
"golang.org/x/net/context"
2125
)
@@ -212,6 +216,183 @@ func (ProjectLogic) getOwner(ctx context.Context, id int) int {
212216
return user.Uid
213217
}
214218

219+
// ParseProjectList 解析其他网站的开源项目
220+
func (self ProjectLogic) ParseProjectList(pUrl string) error {
221+
pUrl = strings.TrimSpace(pUrl)
222+
if !strings.HasPrefix(pUrl, "http") {
223+
pUrl = "http://" + pUrl
224+
}
225+
226+
var (
227+
doc *goquery.Document
228+
err error
229+
)
230+
231+
if doc, err = goquery.NewDocument(pUrl); err != nil {
232+
logger.Errorln("goquery opensource project newdocument error:", err)
233+
return err
234+
}
235+
236+
// 最后面的先入库处理
237+
projectsSelection := doc.Find(".ProjectList .List li")
238+
239+
for i := projectsSelection.Length() - 1; i >= 0; i-- {
240+
241+
contentSelection := goquery.NewDocumentFromNode(projectsSelection.Get(i)).Selection
242+
projectUrl, ok := contentSelection.Find("h3 a").Attr("href")
243+
244+
if !ok || projectUrl == "" {
245+
continue
246+
}
247+
err = self.ParseOneProject(projectUrl)
248+
249+
if err != nil {
250+
logger.Errorln(err)
251+
}
252+
}
253+
254+
return err
255+
}
256+
257+
const (
	// OsChinaDomain is the base URL prepended to site-relative oschina.net
	// links before they are fetched.
	OsChinaDomain = "http://www.oschina.net"

	// ProjectLogoPrefix is the prefix used for open source project logos.
	ProjectLogoPrefix = "plogo"
)

// PresetUsernames lists the accounts that imported projects are attributed
// to; one of them is picked at random for each imported project.
var PresetUsernames = []string{"polaris", "blov", "agolangf", "xuanbao"}
263+
264+
// ParseOneProject 处理单个 project
265+
func (ProjectLogic) ParseOneProject(projectUrl string) error {
266+
if !strings.HasPrefix(projectUrl, "http") {
267+
projectUrl = OsChinaDomain + projectUrl
268+
}
269+
270+
var (
271+
doc *goquery.Document
272+
err error
273+
)
274+
275+
// 加上 ?fromerr=xfwefs,否则页面有 js 重定向
276+
if doc, err = goquery.NewDocument(projectUrl + "?fromerr=xfwefs"); err != nil {
277+
return errors.New("goquery fetch " + projectUrl + " error:" + err.Error())
278+
}
279+
280+
// 标题
281+
category := strings.TrimSpace(doc.Find(".Project .name").Text())
282+
name := strings.TrimSpace(doc.Find(".Project .name u").Text())
283+
if category == "" && name == "" {
284+
return errors.New("projectUrl:" + projectUrl + " category and name are empty")
285+
}
286+
287+
tmpIndex := strings.LastIndex(category, name)
288+
if tmpIndex != -1 {
289+
category = category[:tmpIndex]
290+
}
291+
292+
// uri
293+
uri := projectUrl[strings.LastIndex(projectUrl, "/")+1:]
294+
295+
project := &model.OpenProject{}
296+
297+
_, err = MasterDB.Where("uri=?", uri).Get(project)
298+
// 已经存在
299+
if project.Id != 0 {
300+
return errors.New("url" + projectUrl + "has exists!")
301+
}
302+
303+
logoSelection := doc.Find(".Project .PN img")
304+
if logoSelection.AttrOr("title", "") != "" {
305+
project.Logo = logoSelection.AttrOr("src", "")
306+
307+
if !strings.HasPrefix(project.Logo, "http") {
308+
project.Logo = OsChinaDomain + project.Logo
309+
}
310+
311+
project.Logo, err = DefaultUploader.TransferUrl(nil, project.Logo, ProjectLogoPrefix)
312+
if err != nil {
313+
logger.Errorln("project logo upload error:", err)
314+
}
315+
}
316+
317+
// 获取项目相关链接
318+
doc.Find("#Body .urls li").Each(func(i int, liSelection *goquery.Selection) {
319+
aSelection := liSelection.Find("a")
320+
uri := util.FetchRealUrl(OsChinaDomain + aSelection.AttrOr("href", ""))
321+
switch aSelection.Text() {
322+
case "软件首页":
323+
project.Home = uri
324+
case "软件文档":
325+
project.Doc = uri
326+
case "软件下载":
327+
project.Download = uri
328+
}
329+
})
330+
331+
ctime := time.Now()
332+
doc.Find("#Body .attrs li").Each(func(i int, liSelection *goquery.Selection) {
333+
aSelection := liSelection.Find("a")
334+
txt := aSelection.Text()
335+
if i == 0 {
336+
project.Licence = txt
337+
if txt == "未知" {
338+
project.Licence = "其他"
339+
}
340+
} else if i == 1 {
341+
project.Lang = txt
342+
} else if i == 2 {
343+
project.Os = txt
344+
} else if i == 3 {
345+
dtime, err := time.ParseInLocation("2006年01月02日", aSelection.Last().Text(), time.Local)
346+
if err != nil {
347+
logger.Errorln("parse ctime error:", err)
348+
} else {
349+
ctime = dtime.Local()
350+
}
351+
}
352+
})
353+
354+
project.Name = name
355+
project.Category = category
356+
project.Uri = uri
357+
project.Repo = strings.TrimSpace(doc.Find("#Body .github-widget").AttrOr("data-repo", ""))
358+
project.Src = "https://github.com/" + project.Repo
359+
360+
pos := strings.Index(project.Repo, "/")
361+
if pos > -1 {
362+
project.Author = project.Repo[:pos]
363+
} else {
364+
project.Author = "网友"
365+
}
366+
367+
if project.Doc == "" {
368+
// TODO:暂时认为一定是 Go 语言
369+
project.Doc = "https://godoc.org/" + project.Src[8:]
370+
}
371+
372+
desc := ""
373+
doc.Find("#Body .detail").Find("p").NextAll().Each(func(i int, domSelection *goquery.Selection) {
374+
doc.FindSelection(domSelection).WrapHtml(`<div id="tmp` + strconv.Itoa(i) + `"></div>`)
375+
domHtml, _ := doc.Find("#tmp" + strconv.Itoa(i)).Html()
376+
if domSelection.Is("pre") {
377+
desc += domHtml + "\n\n"
378+
} else {
379+
desc += html2md.Convert(domHtml) + "\n\n"
380+
}
381+
})
382+
383+
project.Desc = strings.TrimSpace(desc)
384+
project.Username = PresetUsernames[rand.Intn(4)]
385+
project.Status = model.ProjectStatusOnline
386+
project.Ctime = model.OftenTime(ctime)
387+
388+
_, err = MasterDB.Insert(project)
389+
if err != nil {
390+
return errors.New("insert into open project error:" + err.Error())
391+
}
392+
393+
return nil
394+
}
395+
215396
// 项目评论
216397
type ProjectComment struct{}
217398

0 commit comments

Comments
 (0)