10 changes: 7 additions & 3 deletions dict_util.go
@@ -38,7 +38,7 @@ const (
zhT1 = "dict/zh/t_1.txt"
)

// Init init seg config
// Init initializes the segmenter config
func (seg *Segmenter) Init() {
if seg.MinTokenFreq == 0 {
seg.MinTokenFreq = 2.0
@@ -47,6 +47,10 @@ func (seg *Segmenter) Init() {
if seg.TextFreq == "" {
seg.TextFreq = "2.0"
}

if !seg.NotLoadHMM {
seg.LoadModel()
}
}

// Dictionary returns the dictionary used by the tokenizer
@@ -66,7 +70,7 @@ func (seg *Segmenter) ToToken(text string, freq float64, pos ...string) Token {
return token
}

// AddToken add new text to token
// AddToken adds a new text token to the dictionary
func (seg *Segmenter) AddToken(text string, freq float64, pos ...string) error {
token := seg.ToToken(text, freq, pos...)
return seg.Dict.AddToken(token)
@@ -364,7 +368,7 @@ func (seg *Segmenter) Reader(reader *bufio.Reader, files ...string) error {
pos = ""
}

// Add participle tokens to dictionary
// Add participle tokens to the dictionary
words := seg.SplitTextToWords([]byte(text))
token := Token{text: words, freq: freq, pos: pos}
seg.Dict.AddToken(token)
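The new NotLoadHMM option lets a caller skip the HMM model load that Init now performs by default. A minimal sketch of the new behavior (the dictionary path is hypothetical, and it assumes LoadDict triggers Init, as the TestLoadDictSep change below suggests):

package main

import (
	"fmt"

	"github.com/go-ego/gse"
)

func main() {
	var seg gse.Segmenter
	// Skip loading the default (Chinese char) HMM model in Init.
	seg.NotLoadHMM = true

	// Hypothetical dictionary path, for illustration only.
	if err := seg.LoadDict("./testdata/test_en.txt"); err != nil {
		fmt.Println("load dict error:", err)
		return
	}

	fmt.Println(seg.Cut("hello world"))
}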
30 changes: 15 additions & 15 deletions dictionary.go
@@ -17,37 +17,37 @@ import (
"github.com/vcaesar/cedar"
)

// Dictionary 结构体实现了一个字串双数组树,
// 一个分词可能出现在叶子节点也有可能出现在非叶节点
// Dictionary struct implements a string double-array trie;
// a token may appear in a leaf node or in a non-leaf node
type Dictionary struct {
trie *cedar.Cedar // Cedar 双数组树
trie *cedar.Cedar // Cedar double array trie

maxTokenLen int // 词典中最长的分词
Tokens []Token // 词典中所有的分词,方便遍历
totalFreq float64 // 词典中所有分词的频率之和
maxTokenLen int // the maximum token length in the dictionary
Tokens []Token // all tokens in the dictionary, kept for traversal
totalFreq float64 // the total frequency of all tokens in the dictionary
}

// NewDict new dictionary
// NewDict creates a new dictionary trie
func NewDict() *Dictionary {
return &Dictionary{trie: cedar.New()}
}

// MaxTokenLen 词典中最长的分词
// MaxTokenLen returns the maximum token length in the dictionary
func (dict *Dictionary) MaxTokenLen() int {
return dict.maxTokenLen
}

// NumTokens 词典中分词数目
// NumTokens returns the number of tokens in the dictionary
func (dict *Dictionary) NumTokens() int {
return len(dict.Tokens)
}

// TotalFreq 词典中所有分词的频率之和
// TotalFreq returns the total frequency of all tokens in the dictionary
func (dict *Dictionary) TotalFreq() float64 {
return dict.totalFreq
}

// AddToken 向词典中加入一个分词
// AddToken adds a token to the dictionary
func (dict *Dictionary) AddToken(token Token) error {
bytes := textSliceToBytes(token.text)
val, err := dict.trie.Get(bytes)
@@ -77,8 +77,8 @@ func (dict *Dictionary) RemoveToken(token Token) error {
return dict.trie.Delete(bytes)
}

// LookupTokens 在词典中查找和字元组 words 可以前缀匹配的所有分词
// 返回值为找到的分词数
// LookupTokens finds all tokens in the dictionary that prefix-match
// the word elements in words, and returns the number of tokens found
func (dict *Dictionary) LookupTokens(
words []Text, tokens []*Token) (numOfTokens int) {
var (
@@ -103,7 +103,7 @@ func (dict *Dictionary) LookupTokens(
}

// Find finds whether the word exists in the dictionary,
// and the word's frequency, pos
// and the word's frequency and pos
func (dict *Dictionary) Find(word []byte) (float64, string, bool) {
var (
id, value int
@@ -131,7 +131,7 @@ func (dict *Dictionary) Find(word []byte) (float64, string, bool) {
}

// Value finds the word in the dictionary and
// retrun the word's value, id
// return the word's value and id
func (dict *Dictionary) Value(word []byte) (val, id int, err error) {
id, err = dict.trie.Jump(word, id)
if err != nil {
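The translated comments above describe the Dictionary accessors. A short sketch tying them together (illustrative only: imports as in the sketch above, the path and token values are made up, and the token is added through the Segmenter, which builds the Token for us):

func dictDemo() error {
	var seg gse.Segmenter
	seg.NotLoadHMM = true
	// Hypothetical dictionary path, as before.
	if err := seg.LoadDict("./testdata/test_en.txt"); err != nil {
		return err
	}

	// Add a custom token, then look it up in the trie.
	if err := seg.AddToken("帝国大厦", 3, "nr"); err != nil {
		return err
	}

	freq, pos, found := seg.Dict.Find([]byte("帝国大厦"))
	fmt.Println(freq, pos, found) // expect: 3 nr true

	fmt.Println(seg.Dict.NumTokens(), seg.Dict.MaxTokenLen(), seg.Dict.TotalFreq())
	return nil
}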
12 changes: 4 additions & 8 deletions gse.go
@@ -30,21 +30,17 @@ const (
// minTokenFrequency = 2 // only read tokens with frequency >= 2 from the dictionary
)

func init() {
hmm.LoadModel()
}

// GetVersion get the gse version
// GetVersion gets the version of gse
func GetVersion() string {
return Version
}

// Prob type hmm model struct
// Prob defines the hmm model struct
type Prob struct {
B, E, M, S map[rune]float64
}

// New return new gse segmenter
// New returns a new gse segmenter
func New(files ...string) (seg Segmenter, err error) {
if len(files) > 1 && files[1] == "alpha" {
seg.AlphaNum = true
@@ -124,7 +120,7 @@ func (seg *Segmenter) CutStr(str []string, separator ...string) (r string) {
return
}

// LoadModel load the hmm model
// LoadModel loads the hmm model (the default is the Chinese char model)
//
// Use the user's model:
//
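With the package-level init() removed, importing gse no longer loads the HMM model as a side effect; New (via Segmenter.Init) now takes care of it unless NotLoadHMM is set. A sketch of the "alpha" option that New handles above (imports as before; the dictionary path is hypothetical):

func newDemo() error {
	// The second "alpha" argument sets seg.AlphaNum, per New above.
	seg, err := gse.New("./testdata/test_en.txt", "alpha")
	if err != nil {
		return err
	}

	fmt.Println("gse version:", gse.GetVersion())
	// CutStr joins the cut result with a custom separator.
	fmt.Println(seg.CutStr(seg.Cut("Hello world"), " | "))
	return nil
}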
11 changes: 6 additions & 5 deletions gse_test.go
@@ -39,15 +39,15 @@ func TestLoadDictMap(t *testing.T) {
}

func TestAnalyze(t *testing.T) {
txt := `城市地标建筑: 纽约帝国大厦, 旧金山湾金门大桥, Seattle Space Needle, Toronto CN Tower, 伦敦大笨钟`
txt := `城市地标建筑: 纽约帝国大厦, 旧金山湾金门大桥, Seattle Space Needle; Toronto CN Tower, 伦敦大笨钟`

s := prodSeg.Cut(txt, true)
tt.Equal(t, 23, len(s))
tt.Equal(t, "[城市地标 建筑 : 纽约 帝国大厦 , 旧金山湾 金门大桥 , seattle space needle , toronto cn tower , 伦敦 大笨钟]", s)
tt.Equal(t, "[城市地标 建筑 : 纽约 帝国大厦 , 旧金山湾 金门大桥 , seattle space needle ; toronto cn tower , 伦敦 大笨钟]", s)

a := prodSeg.Analyze(s, "", true)
tt.Equal(t, 23, len(a))
tt.Equal(t, "[{0 4 0 0 城市地标 3 j} {4 6 1 0 建筑 14397 n} {6 8 2 0 : 0 } {8 10 3 0 纽约 1758 ns} {10 14 4 0 帝国大厦 3 nr} {14 16 5 0 , 0 } {16 20 6 0 旧金山湾 3 ns} {20 24 7 0 金门大桥 38 nz} {24 26 8 0 , 0 } {26 33 9 0 seattle 0 } {33 34 10 0 0 } {34 39 11 0 space 0 } {39 40 12 0 0 } {40 46 13 0 needle 0 } {46 48 14 0 , 0 } {48 55 15 0 toronto 0 } {55 56 16 0 0 } {56 58 17 0 cn 0 } {58 59 18 0 0 } {59 64 19 0 tower 0 } {64 66 20 0 , 0 } {66 68 21 0 伦敦 2255 ns} {68 71 22 0 大笨钟 0 }]", a)
tt.Equal(t, "[{0 4 0 0 城市地标 3 j} {4 6 1 0 建筑 14397 n} {6 8 2 0 : 0 } {8 10 3 0 纽约 1758 ns} {10 14 4 0 帝国大厦 3 nr} {14 16 5 0 , 0 } {16 20 6 0 旧金山湾 3 ns} {20 24 7 0 金门大桥 38 nz} {24 26 8 0 , 0 } {26 33 9 0 seattle 0 } {33 34 10 0 0 } {34 39 11 0 space 0 } {39 40 12 0 0 } {40 46 13 0 needle 0 } {46 48 14 0 ; 0 } {48 55 15 0 toronto 0 } {55 56 16 0 0 } {56 58 17 0 cn 0 } {58 59 18 0 0 } {59 64 19 0 tower 0 } {64 66 20 0 , 0 } {66 68 21 0 伦敦 2255 ns} {68 71 22 0 大笨钟 0 }]", a)

tt.Equal(t, 0, a[0].Start)
tt.Equal(t, 4, a[0].End)
@@ -59,11 +59,11 @@

s = prodSeg.CutSearch(txt, true)
tt.Equal(t, 34, len(s))
tt.Equal(t, "[城市 市地 地标 城市地标 建筑 : 纽约 帝国 国大 大厦 帝国大厦 , 金山 山湾 旧金山 旧金山湾 金门 大桥 金门大桥 , seattle space needle , toronto cn tower , 伦敦 大笨钟]", s)
tt.Equal(t, "[城市 市地 地标 城市地标 建筑 : 纽约 帝国 国大 大厦 帝国大厦 , 金山 山湾 旧金山 旧金山湾 金门 大桥 金门大桥 , seattle space needle ; toronto cn tower , 伦敦 大笨钟]", s)

a = prodSeg.Analyze(s, txt)
tt.Equal(t, 34, len(a))
tt.Equal(t, "[{0 6 0 0 城市 25084 ns} {3 9 1 0 市地 11 n} {6 12 2 0 地标 32 n} {0 12 3 0 城市地标 3 j} {12 18 4 0 建筑 14397 n} {18 20 5 0 : 0 } {20 26 6 0 纽约 1758 ns} {26 32 7 0 帝国 3655 n} {29 35 8 0 国大 114 j} {32 38 9 0 大厦 777 n} {26 38 10 0 帝国大厦 3 nr} {104 106 11 0 , 0 } {43 49 12 0 金山 291 nr} {46 52 13 0 山湾 7 ns} {40 49 14 0 旧金山 238 ns} {40 52 15 0 旧金山湾 3 ns} {52 58 16 0 金门 149 n} {58 64 17 0 大桥 3288 ns} {52 64 18 0 金门大桥 38 nz} {86 88 19 0 , 0 } {66 73 20 0 seattle 0 } {105 106 21 0 0 } {74 79 22 0 space 0 } {98 99 23 0 0 } {80 86 24 0 needle 0 } {64 66 25 0 , 0 } {88 95 26 0 toronto 0 } {95 96 27 0 0 } {96 98 28 0 cn 0 } {87 88 29 0 0 } {99 104 30 0 tower 0 } {38 40 31 0 , 0 } {106 112 32 0 伦敦 2255 ns} {112 121 33 0 大笨钟 0 }]", a)
tt.Equal(t, "[{0 6 0 0 城市 25084 ns} {3 9 1 0 市地 11 n} {6 12 2 0 地标 32 n} {0 12 3 0 城市地标 3 j} {12 18 4 0 建筑 14397 n} {18 20 5 0 : 0 } {20 26 6 0 纽约 1758 ns} {26 32 7 0 帝国 3655 n} {29 35 8 0 国大 114 j} {32 38 9 0 大厦 777 n} {26 38 10 0 帝国大厦 3 nr} {104 106 11 0 , 0 } {43 49 12 0 金山 291 nr} {46 52 13 0 山湾 7 ns} {40 49 14 0 旧金山 238 ns} {40 52 15 0 旧金山湾 3 ns} {52 58 16 0 金门 149 n} {58 64 17 0 大桥 3288 ns} {52 64 18 0 金门大桥 38 nz} {64 66 19 0 , 0 } {66 73 20 0 seattle 0 } {105 106 21 0 0 } {74 79 22 0 space 0 } {98 99 23 0 0 } {80 86 24 0 needle 0 } {86 88 25 0 ; 0 } {88 95 26 0 toronto 0 } {95 96 27 0 0 } {96 98 28 0 cn 0 } {87 88 29 0 0 } {99 104 30 0 tower 0 } {38 40 31 0 , 0 } {106 112 32 0 伦敦 2255 ns} {112 121 33 0 大笨钟 0 }]", a)
}

func TestHMM(t *testing.T) {
@@ -313,6 +313,7 @@ func TestUrl(t *testing.T) {
func TestLoadDictSep(t *testing.T) {
var seg1 Segmenter
seg1.DictSep = ","
seg1.NotLoadHMM = true
err := seg1.LoadDict("./testdata/test_en.txt")
tt.Nil(t, err)

31 changes: 17 additions & 14 deletions segmenter.go
@@ -20,12 +20,15 @@ import (
"unicode/utf8"
)

// Segmenter 分词器结构体
// Segmenter defines the segmenter structure
type Segmenter struct {
Dict *Dictionary
Load bool
DictSep string

// NotLoadHMM option skips loading the default hmm model config (Chinese char)
NotLoadHMM bool

// AlphaNum sets whether SplitTextToWords can add tokens
// when the words are alphanumeric,
// setting up alphanumeric dictionary word segmentation
@@ -60,15 +63,15 @@ type jumper struct {
token *Token
}

// Segment 对文本分词
// Segment uses the shortest path to segment the text
//
// 输入参数
// input parameter
//
// bytes UTF8 文本的字节数组
// bytes UTF8 text []byte
//
// 输出
// output
//
// []Segment 划分的分词
// []Segment the segmentation result
func (seg *Segmenter) Segment(bytes []byte) []Segment {
return seg.internalSegment(bytes, false)
}
@@ -84,13 +87,13 @@ func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment {
}

func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
// 处理特殊情况
// handle the special case
if len(bytes) == 0 {
// return []Segment{}
return nil
}

// 划分字元
// split text to words
text := seg.SplitTextToWords(bytes)

return seg.segmentWords(text, searchMode)
@@ -182,13 +185,13 @@ func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
}
}

// SplitWords 将文本划分成字元
// SplitWords splits the text into token words
func SplitWords(text Text) []Text {
var seg Segmenter
return seg.SplitTextToWords(text)
}

// SplitTextToWords 将文本划分成字元
// SplitTextToWords splits the text into token words
func (seg *Segmenter) SplitTextToWords(text Text) []Text {
output := make([]Text, 0, len(text)/3)
current, alphanumericStart := 0, 0
Expand Down Expand Up @@ -221,7 +224,7 @@ func (seg *Segmenter) SplitTextToWords(text Text) []Text {
current += size
}

// 处理最后一个字元是英文的情况
// process the case where the last word is alphanumeric
if inAlphanumeric && !seg.AlphaNum {
if current != 0 {
output = append(output, toLow(text[alphanumericStart:current]))
@@ -239,7 +242,7 @@ func toLow(text []byte) []byte {
return text
}

// toLower 将英文词转化为小写
// toLower converts the text to lower case
func toLower(text []byte) []byte {
output := make([]byte, len(text))
for i, t := range text {
@@ -253,15 +256,15 @@ func toLower(text []byte) []byte {
return output
}

// minInt 取两整数较小值
// minInt returns the smaller of two ints
func minInt(a, b int) int {
if a > b {
return b
}
return a
}

// maxInt 取两整数较大值
// maxInt returns the larger of two ints
func maxInt(a, b int) int {
if a > b {
return a
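Segment and SplitWords are the two entry points documented above. A final sketch of both (imports as before; Token().Text() and a []byte-based Text type are assumed from gse's public API rather than shown in this diff, and the output comments are indicative):

func splitDemo() error {
	var seg gse.Segmenter
	seg.NotLoadHMM = true
	// Hypothetical dictionary path, as in the earlier sketches.
	if err := seg.LoadDict("./testdata/test_en.txt"); err != nil {
		return err
	}

	// Shortest-path segmentation over raw UTF-8 bytes.
	for _, s := range seg.Segment([]byte("Hello world")) {
		fmt.Println(s.Token().Text())
	}

	// Dictionary-free split into word elements; the English run is
	// lowercased because AlphaNum is off.
	for _, w := range gse.SplitWords(gse.Text("Hello 世界")) {
		fmt.Printf("%q ", w) // e.g. "hello" " " "世" "界"
	}
	return nil
}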