@@ -8,13 +8,18 @@ package service
88
99import (
1010 "errors"
11+ "math/rand"
1112 "net/url"
1213 "strconv"
1314 "strings"
15+ "time"
1416
1517 "logger"
1618 "model"
1719 "util"
20+
21+ "github.com/PuerkitoBio/goquery"
22+ "github.com/lunny/html2md"
1823)
1924
2025func PublishProject (user map [string ]interface {}, form url.Values ) (err error ) {
@@ -211,6 +216,156 @@ func ProjectsTotal() (total int) {
211216 return
212217}
213218
219+ // ParseProjectList 解析其他网站的开源项目
220+ func ParseProjectList (pUrl string ) error {
221+ pUrl = strings .TrimSpace (pUrl )
222+ if ! strings .HasPrefix (pUrl , "http" ) {
223+ pUrl = "http://" + pUrl
224+ }
225+
226+ var (
227+ doc * goquery.Document
228+ err error
229+ )
230+
231+ if doc , err = goquery .NewDocument (pUrl ); err != nil {
232+ logger .Errorln ("goquery opensource project newdocument error:" , err )
233+ return err
234+ }
235+
236+ // 最后面的先入库处理
237+ projectsSelection := doc .Find (".ProjectList .List li" )
238+
239+ for i := projectsSelection .Length () - 1 ; i >= 0 ; i -- {
240+
241+ contentSelection := goquery .NewDocumentFromNode (projectsSelection .Get (i )).Selection
242+ projectUrl , ok := contentSelection .Find ("h3 a" ).Attr ("href" )
243+
244+ if ! ok || projectUrl == "" {
245+ continue
246+ }
247+ err = ParseOneProject (projectUrl )
248+
249+ if err != nil {
250+ logger .Errorln (err )
251+ }
252+ }
253+
254+ return err
255+ }
256+
// OsChinaDomain is the scheme and host prepended to site-relative oschina.net
// links.
const OsChinaDomain = "http://www.oschina.net"

// PresetUsernames lists the usernames an imported project can be assigned to;
// ParseOneProject picks one of them at random.
var PresetUsernames = []string{
	"polaris",
	"blov",
	"agolangf",
	"xuanbao",
}
260+
261+ // ParseOneProject 处理单个 project
262+ func ParseOneProject (projectUrl string ) error {
263+ if ! strings .HasPrefix (projectUrl , "http" ) {
264+ projectUrl = OsChinaDomain + projectUrl
265+ }
266+
267+ var (
268+ doc * goquery.Document
269+ err error
270+ )
271+
272+ if doc , err = goquery .NewDocument (projectUrl ); err != nil {
273+ return errors .New ("goquery fetch " + projectUrl + " error:" + err .Error ())
274+ }
275+
276+ // 标题
277+ category := strings .TrimSpace (doc .Find (".Project .name" ).Text ())
278+ name := strings .TrimSpace (doc .Find (".Project .name u" ).Text ())
279+ tmpIndex := strings .LastIndex (category , name )
280+ if tmpIndex != - 1 {
281+ category = category [:tmpIndex ]
282+ }
283+
284+ // uri
285+ uri := projectUrl [strings .LastIndex (projectUrl , "/" )+ 1 :]
286+
287+ project := model .NewOpenProject ()
288+
289+ err = project .Where ("uri=?" , uri ).Find ("id" )
290+ // 已经存在
291+ if project .Id != 0 {
292+ return errors .New ("url" + projectUrl + "has exists!" )
293+ }
294+
295+ // 获取项目相关链接
296+ doc .Find ("#Body .urls li" ).Each (func (i int , liSelection * goquery.Selection ) {
297+ aSelection := liSelection .Find ("a" )
298+ uri := util .FetchRealUrl (OsChinaDomain + aSelection .AttrOr ("href" , "" ))
299+ switch aSelection .Text () {
300+ case "软件首页" :
301+ project .Home = uri
302+ case "软件文档" :
303+ project .Doc = uri
304+ case "软件下载" :
305+ project .Download = uri
306+ }
307+ })
308+
309+ ctime := util .TimeNow ()
310+
311+ doc .Find ("#Body .attrs li" ).Each (func (i int , liSelection * goquery.Selection ) {
312+ aSelection := liSelection .Find ("a" )
313+ txt := aSelection .Text ()
314+ if i == 0 {
315+ project .Licence = txt
316+ if txt == "未知" {
317+ project .Licence = "其他"
318+ }
319+ } else if i == 1 {
320+ project .Lang = txt
321+ } else if i == 2 {
322+ project .Os = txt
323+ } else if i == 3 {
324+ dtime , err := time .ParseInLocation ("2006年01月02日" , aSelection .Last ().Text (), time .Local )
325+ if err != nil {
326+ logger .Errorln ("parse ctime error:" , err )
327+ } else {
328+ ctime = dtime .Local ().Format ("2006-01-02 15:04:05" )
329+ }
330+ }
331+ })
332+
333+ project .Name = name
334+ project .Category = category
335+ project .Uri = uri
336+ project .Repo = strings .TrimSpace (doc .Find ("#Body .github-widget" ).AttrOr ("data-repo" , "" ))
337+ project .Src = "https://github.com/" + project .Repo
338+ project .Author = project .Repo [:strings .Index (project .Repo , "/" )]
339+
340+ if project .Doc == "" {
341+ // TODO:暂时认为一定是 Go 语言
342+ project .Doc = "https://godoc.org/" + project .Src
343+ }
344+
345+ desc := ""
346+ doc .Find ("#Body .detail" ).Find ("p" ).NextAll ().Each (func (i int , domSelection * goquery.Selection ) {
347+ doc .FindSelection (domSelection ).WrapHtml (`<div id="tmp` + strconv .Itoa (i ) + `"></div>` )
348+ domHtml , _ := doc .Find ("#tmp" + strconv .Itoa (i )).Html ()
349+ if domSelection .Is ("pre" ) {
350+ desc += domHtml + "\n \n "
351+ } else {
352+ desc += html2md .Convert (domHtml ) + "\n \n "
353+ }
354+ })
355+
356+ project .Desc = strings .TrimSpace (desc )
357+ project .Username = PresetUsernames [rand .Intn (4 )]
358+ project .Status = model .ProjectStatusOnline
359+ project .Ctime = ctime
360+
361+ _ , err = project .Insert ()
362+ if err != nil {
363+ return errors .New ("insert into open project error:" + err .Error ())
364+ }
365+
366+ return nil
367+ }
368+
214369// 通过objid获得 project 的所有者
215370func getProjectOwner (id int ) int {
216371 project := model .NewOpenProject ()
0 commit comments