Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 182595d

Browse files
committed
过滤垃圾文章:包含很多链接
1 parent af42ebe commit 182595d

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

src/logic/article.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,15 @@ func (ArticleLogic) ParseArticle(ctx context.Context, articleUrl string, auto bo
149149

150150
// 自动抓取,内容长度不能少于 300 字
151151
if auto && len(txt) < 300 {
152-
logger.Infoln(articleUrl, "content is short")
152+
logger.Errorln(articleUrl, "content is short")
153153
return nil, errors.New("content is short")
154154
}
155155

156+
if auto && strings.Count(txt, "http://") > 10 {
157+
logger.Errorln(articleUrl, "content contains too many link!")
158+
return nil, errors.New("content contains too many link")
159+
}
160+
156161
pubDate := times.Format("Y-m-d H:i:s")
157162
if rule.PubDate != "" {
158163
pubDate = strings.TrimSpace(doc.Find(rule.PubDate).First().Text())

0 commit comments

Comments
 (0)