You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

57 lines
1.0 KiB

package util
import (
	"path/filepath"
	"unicode"
	"unicode/utf8"

	"github.com/yanyiwu/gojieba"
)
// jieba is the package-level gojieba instance. It starts out nil, is assigned
// by NewJieba the first time that function runs, and is then shared by every
// Jieba wrapper NewJieba returns.
var jieba *gojieba.Jieba
// Jieba wraps a *gojieba.Jieba and exposes the word-adding and keyword
// extraction helpers defined in this package. Obtain one with NewJieba.
type Jieba struct {
// jieba is the wrapped gojieba instance that the methods delegate to.
jieba *gojieba.Jieba
}
// NewJieba returns a Jieba wrapper around the package-level gojieba instance.
//
// dictDir optionally names the directory that holds the dictionary files
// jieba.dict.utf8, hmm_model.utf8, user.dict.utf8, idf.utf8 and
// stop_words.utf8. Only its first element is used; "dict" is assumed when it
// is omitted. An empty string resolves the files relative to the current
// working directory.
//
// The underlying instance is created only on the first call. Later calls reuse
// it, so a dictDir passed after the first call has no effect.
//
// NOTE(review): the lazy initialisation is not synchronised; this looks like it
// relies on the first call happening before any concurrent use — confirm
// against callers.
func NewJieba(dictDir ...string) *Jieba {
	dir := "dict"
	if len(dictDir) > 0 {
		dir = dictDir[0]
	}

	if jieba == nil {
		// filepath.Join copes with trailing separators and empty directories,
		// which plain "dir + \"/\" + name" concatenation does not. The paths are
		// passed positionally, so their order must stay as it is.
		jieba = gojieba.NewJieba(
			filepath.Join(dir, "jieba.dict.utf8"),
			filepath.Join(dir, "hmm_model.utf8"),
			filepath.Join(dir, "user.dict.utf8"),
			filepath.Join(dir, "idf.utf8"),
			filepath.Join(dir, "stop_words.utf8"),
		)
	}

	return &Jieba{jieba: jieba}
}
// AddWord passes each of the given words, in order, to the wrapped gojieba
// instance's AddWord method. Calling it with no arguments does nothing.
func (j *Jieba) AddWord(words ...string) {
	for i := 0; i < len(words); i++ {
		j.jieba.AddWord(words[i])
	}
}
// SegWords extracts keywords from text with the wrapped gojieba instance and
// drops every keyword whose first character is whitespace, punctuation or a
// digit.
//
// length optionally sets how many keywords are requested from the extractor.
// Only its first element is used; the default is 10. Filtering happens after
// extraction, so fewer words than requested may be returned. The result is nil
// when no keyword survives the filter.
func (j *Jieba) SegWords(text string, length ...int) (words []string) {
	topk := 10
	if len(length) > 0 {
		topk = length[0]
	}

	for _, wd := range j.jieba.Extract(text, topk) {
		// Classify the first rune, not the first byte: for multi-byte UTF-8
		// (e.g. full-width punctuation or digits) the lead byte on its own maps
		// to an unrelated code point and would slip through the filter.
		first, size := utf8.DecodeRuneInString(wd)
		if size == 0 {
			// Empty keyword: nothing to classify and nothing worth returning.
			continue
		}
		// Keep only words that do not start with a space, punctuation or a digit.
		if unicode.IsSpace(first) || unicode.IsPunct(first) || unicode.IsDigit(first) {
			continue
		}
		words = append(words, wd)
	}
	return words
}