|
| 1 | +// Copyright 2013 The StudyGolang Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | +// http://studygolang.com, http://golang.top |
| 5 | +// Author:polaris [email protected] |
| 6 | + |
| 7 | +// 解析 http://www.reddit.com/r/golang 最新 Go 信息 |
| 8 | +package service |
| 9 | + |
| 10 | +import ( |
| 11 | + "errors" |
| 12 | + "math/rand" |
| 13 | + "regexp" |
| 14 | + "strings" |
| 15 | + "time" |
| 16 | + |
| 17 | + "github.com/PuerkitoBio/goquery" |
| 18 | + "logger" |
| 19 | + "model" |
| 20 | + "util" |
| 21 | +) |
| 22 | + |
const (
	// Reddit is the base URL of the Reddit site. It is also prepended to the
	// site-relative links of Reddit-hosted posts.
	Reddit = "http://www.reddit.com"
	// RedditGolang is the path of the golang subreddit, relative to Reddit.
	// Reddit + RedditGolang is the listing ParseReddit loads by default.
	RedditGolang = "/r/golang"
)
| 27 | + |
| 28 | +// 获取url对应的文章并根据规则进行解析 |
| 29 | +func ParseReddit(redditUrl string) error { |
| 30 | + redditUrl = strings.TrimSpace(redditUrl) |
| 31 | + if redditUrl == "" { |
| 32 | + redditUrl = Reddit + RedditGolang |
| 33 | + } else if !strings.HasPrefix(redditUrl, "http") { |
| 34 | + redditUrl = "http://" + redditUrl |
| 35 | + } |
| 36 | + |
| 37 | + var ( |
| 38 | + doc *goquery.Document |
| 39 | + err error |
| 40 | + ) |
| 41 | + |
| 42 | + if doc, err = goquery.NewDocument(redditUrl); err != nil { |
| 43 | + logger.Errorln("goquery reddit newdocument error:", err) |
| 44 | + return err |
| 45 | + } |
| 46 | + |
| 47 | + doc.Find("#siteTable .link").Each(func(i int, contentSelection *goquery.Selection) { |
| 48 | + |
| 49 | + err = dealRedditOneResource(contentSelection) |
| 50 | + |
| 51 | + if err != nil { |
| 52 | + logger.Errorln(err) |
| 53 | + } |
| 54 | + }) |
| 55 | + |
| 56 | + return err |
| 57 | +} |
| 58 | + |
var (
	// PresetUids holds the user ids a stored Reddit resource can be
	// attributed to; one of them is picked at random for each new resource.
	PresetUids = []int{1, 1747, 1748, 1827}

	// resourceRe matches a pair of consecutive newlines. It is used to
	// replace each such pair in comment HTML with a single newline.
	resourceRe = regexp.MustCompile(`\n\n`)
)
| 62 | + |
| 63 | +// 处理 Reddit 中的一条资源 |
| 64 | +func dealRedditOneResource(contentSelection *goquery.Selection) error { |
| 65 | + aSelection := contentSelection.Find(".title a.title") |
| 66 | + |
| 67 | + title := aSelection.Text() |
| 68 | + if title == "" { |
| 69 | + return errors.New("title is empty") |
| 70 | + } |
| 71 | + |
| 72 | + resourceUrl, ok := aSelection.Attr("href") |
| 73 | + if !ok || resourceUrl == "" { |
| 74 | + return errors.New("resource url is empty") |
| 75 | + } |
| 76 | + |
| 77 | + resource := model.NewResource() |
| 78 | + // Reddit 自身的内容 |
| 79 | + if contentSelection.HasClass("self") { |
| 80 | + resourceUrl = Reddit + resourceUrl |
| 81 | + } |
| 82 | + |
| 83 | + err := resource.Where("url=?", resourceUrl).Find("id") |
| 84 | + // 已经存在 |
| 85 | + if resource.Id != 0 { |
| 86 | + return errors.New("url" + resourceUrl + "has exists!") |
| 87 | + } |
| 88 | + |
| 89 | + if contentSelection.HasClass("self") { |
| 90 | + |
| 91 | + resource.Form = model.ContentForm |
| 92 | + |
| 93 | + var doc *goquery.Document |
| 94 | + |
| 95 | + if doc, err = goquery.NewDocument(resourceUrl); err != nil { |
| 96 | + return errors.New("goquery reddit.com/r/golang self newdocument error:" + err.Error()) |
| 97 | + } |
| 98 | + |
| 99 | + content, err := doc.Find("#siteTable .usertext .md").Html() |
| 100 | + if err != nil { |
| 101 | + return err |
| 102 | + } |
| 103 | + |
| 104 | + doc.Find(".commentarea .comment .usertext .md").Each(func(i int, contentSel *goquery.Selection) { |
| 105 | + if i == 0 { |
| 106 | + content += `<hr/>**评论:**<br/><br/>` |
| 107 | + } |
| 108 | + |
| 109 | + comment, err := contentSel.Html() |
| 110 | + if err != nil { |
| 111 | + return |
| 112 | + } |
| 113 | + |
| 114 | + comment = strings.TrimSpace(comment) |
| 115 | + comment = resourceRe.ReplaceAllLiteralString(comment, "\n") |
| 116 | + |
| 117 | + author := contentSel.ParentsFiltered(".usertext").Prev().Find(".author").Text() |
| 118 | + content += author + ": <pre>" + comment + "</pre>" |
| 119 | + }) |
| 120 | + |
| 121 | + resource.Content = content |
| 122 | + |
| 123 | + // reddit 本身的,当做其他资源 |
| 124 | + resource.Catid = 4 |
| 125 | + } else { |
| 126 | + resource.Form = model.LinkForm |
| 127 | + |
| 128 | + // Github,是开源项目 |
| 129 | + if contentSelection.Find(".title .domain a").Text() == "github.com" { |
| 130 | + resource.Catid = 2 |
| 131 | + } else { |
| 132 | + resource.Catid = 1 |
| 133 | + } |
| 134 | + } |
| 135 | + |
| 136 | + resource.Title = title |
| 137 | + resource.Url = resourceUrl |
| 138 | + resource.Uid = PresetUids[rand.Intn(4)] |
| 139 | + |
| 140 | + ctime := util.TimeNow() |
| 141 | + datetime, ok := contentSelection.Find(".tagline time").Attr("datetime") |
| 142 | + if ok { |
| 143 | + dtime, err := time.Parse(time.RFC3339, datetime) |
| 144 | + if err != nil { |
| 145 | + logger.Errorln("parse ctime error:", err) |
| 146 | + } else { |
| 147 | + ctime = dtime.Format("2006-01-02 15:04:05") |
| 148 | + } |
| 149 | + } |
| 150 | + resource.Ctime = ctime |
| 151 | + |
| 152 | + var id int64 |
| 153 | + id, err = resource.Insert() |
| 154 | + |
| 155 | + if err != nil { |
| 156 | + return errors.New("insert into Resource error:" + err.Error()) |
| 157 | + } |
| 158 | + |
| 159 | + // 存扩展信息 |
| 160 | + resourceEx := model.NewResourceEx() |
| 161 | + resourceEx.Id = int(id) |
| 162 | + if _, err = resourceEx.Insert(); err != nil { |
| 163 | + return errors.New("insert into ResourceEx error:" + err.Error()) |
| 164 | + } |
| 165 | + |
| 166 | + return nil |
| 167 | +} |
0 commit comments