替换结巴分词

dev
truthhun 1 year ago
parent 4630aa1949
commit 6a115bada2

@ -14,6 +14,7 @@ import (
"moredoc/model"
"moredoc/util"
"moredoc/util/filetil"
"moredoc/util/gse"
"github.com/golang-jwt/jwt"
"go.uber.org/zap"
@ -79,7 +80,6 @@ func (s *DocumentAPIService) CreateDocument(ctx context.Context, req *pb.CreateD
var (
documents []model.Document
docMapAttachment = make(map[int]int64)
jieba = util.NewJieba()
)
for idx, doc := range req.Document {
attachment, ok := attachmentMap[doc.AttachmentId]
@ -89,7 +89,7 @@ func (s *DocumentAPIService) CreateDocument(ctx context.Context, req *pb.CreateD
doc := model.Document{
Title: doc.Title,
Keywords: strings.Join(jieba.SegWords(doc.Title, 10), ","),
Keywords: strings.Join(gse.SegWords(doc.Title), ","),
UserId: userCliams.UserId,
// UUID: uuid.Must(uuid.NewV4()).String(),
Score: 300,

@ -7,6 +7,7 @@ require (
github.com/gin-contrib/cors v1.3.1
github.com/gin-contrib/gzip v0.0.5
github.com/gin-gonic/gin v1.8.1
github.com/go-ego/gse v0.80.2
github.com/gogo/protobuf v1.3.2
github.com/golang/protobuf v1.5.2
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
@ -15,12 +16,12 @@ require (
github.com/mitchellh/go-homedir v1.1.0
github.com/spf13/cobra v1.3.0
github.com/spf13/viper v1.10.1
github.com/yanyiwu/gojieba v1.2.0
go.uber.org/zap v1.21.0
golang.org/x/net v0.1.0
google.golang.org/genproto v0.0.0-20220228195345-15d65a4533f7
google.golang.org/grpc v1.44.0
google.golang.org/protobuf v1.28.0
gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df
gorm.io/driver/mysql v1.3.2
gorm.io/gorm v1.23.2
)
@ -30,9 +31,9 @@ require (
github.com/goccy/go-json v0.9.7 // indirect
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
github.com/pelletier/go-toml/v2 v2.0.1 // indirect
github.com/vcaesar/cedar v0.20.1 // indirect
golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 // indirect
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect
gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df // indirect
)
require (

@ -133,6 +133,8 @@ github.com/gin-gonic/gin v1.6.3/go.mod h1:75u5sXoLsGZoRN5Sgbi1eraJ4GU3++wFwWzhwv
github.com/gin-gonic/gin v1.7.4/go.mod h1:jD2toBW3GZUr5UMcdrwQA10I7RuaFOl/SGeDjXkfUtY=
github.com/gin-gonic/gin v1.8.1 h1:4+fr/el88TOO3ewCmQr8cx/CtZ/umlIRIs5M4NTNjf8=
github.com/gin-gonic/gin v1.8.1/go.mod h1:ji8BvRH1azfM+SYow9zQ6SZMvR8qOMZHmsCuWR9tTTk=
github.com/go-ego/gse v0.80.2 h1:3LRfkaBuwlsHsmkOZvnhTcsYPXUAhiP06Sqcid7mO1M=
github.com/go-ego/gse v0.80.2/go.mod h1:kesekpZfcFQ/kwd9b27VZHUOH5dQUjaaQUZ4OGt4Hj4=
github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
@ -424,8 +426,9 @@ github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6
github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY=
github.com/ugorji/go/codec v1.2.7 h1:YPXUKf7fYbp/y8xloBqZOw2qaVggbfwMlI8WM3wZUJ0=
github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
github.com/yanyiwu/gojieba v1.2.0 h1:axzRPH9jZwMc8rsXUjRuQ0lMjCrbZ6haz38pPtJ8c5M=
github.com/yanyiwu/gojieba v1.2.0/go.mod h1:54wkP7sMJ6bklf7yPl6F+JG71dzVUU1WigZbR47nGdY=
github.com/vcaesar/cedar v0.20.1 h1:cDOmYWdprO7ZW8cngJrDi8Zivnscj9dA/y8Y+2SB1P0=
github.com/vcaesar/cedar v0.20.1/go.mod h1:iMDweyuW76RvSrCkQeZeQk4iCbshiPzcCvcGCtpM7iI=
github.com/vcaesar/tt v0.20.0 h1:9t2Ycb9RNHcP0WgQgIaRKJBB+FrRdejuaL6uWIHuoBA=
github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=

@ -0,0 +1,37 @@
package gse
import (
"unicode"
"unicode/utf8"
"github.com/go-ego/gse"
"github.com/go-ego/gse/hmm/pos"
)
// Package-level segmenters declared for this package.
var (
// seg is the segmenter that init loads the embedded dictionary and stop
// words into, and that SegWords uses to cut text.
seg gse.Segmenter
// posSeg is declared but not referenced anywhere in the visible code.
// NOTE(review): presumably reserved for part-of-speech tagging; confirm or remove.
posSeg pos.Segmenter
)
// init prepares the package-level segmenter by loading the embedded
// dictionary and then the embedded stop-word list. It panics if either load
// reports an error, so the package cannot be used with a segmenter that
// failed to load.
func init() {
	if err := seg.LoadDictEmbed(); err != nil {
		panic(err)
	}
	if err := seg.LoadStopEmbed(); err != nil {
		panic(err)
	}
}
// SegWords cuts text with the package-level segmenter and returns the
// resulting words that are usable as keywords.
//
// A word is dropped when it is empty, consists of a single character, or
// begins with a whitespace, punctuation or digit character. The leading
// character is decoded as a full UTF-8 rune, so multi-byte characters such as
// CJK punctuation and full-width digits are classified correctly. The result
// is nil when no word is kept.
func SegWords(text string) (words []string) {
	for _, wd := range seg.Cut(text) {
		// Skip empty strings and single-character words. Checking "<= 1"
		// also guarantees there is a rune to decode below.
		if utf8.RuneCountInString(wd) <= 1 {
			continue
		}
		// Decode the first rune instead of converting the first byte:
		// rune(wd[0]) would turn the lead byte of a multi-byte character
		// into an unrelated Latin-1 code point.
		first, _ := utf8.DecodeRuneInString(wd)
		if unicode.IsSpace(first) || unicode.IsPunct(first) || unicode.IsDigit(first) {
			continue
		}
		words = append(words, wd)
	}
	return words
}

@ -1,56 +0,0 @@
package util
import (
"unicode"
"github.com/yanyiwu/gojieba"
)
// jieba is the shared gojieba instance. NewJieba creates it when it is still
// nil and hands the same instance to every wrapper it returns.
var jieba *gojieba.Jieba
// Jieba is a thin wrapper around a gojieba instance that exposes word
// registration (AddWord) and keyword extraction (SegWords).
type Jieba struct {
// jieba is the wrapped gojieba instance the methods delegate to.
jieba *gojieba.Jieba
}
// NewJieba returns a Jieba wrapper around the package-level gojieba instance,
// creating that instance if it does not exist yet.
//
// The optional dictDir names the directory that holds the dictionary files
// and defaults to "dict"; only its first element is used. The directory only
// matters on the call that creates the shared instance: once the instance
// exists, later calls reuse it and the argument has no effect.
func NewJieba(dictDir ...string) *Jieba {
	if jieba == nil {
		dir := "dict"
		if len(dictDir) > 0 {
			dir = dictDir[0]
		}
		// The argument order is significant and is kept as in the
		// original call: main dict, HMM model, user dict, IDF, stop words.
		jieba = gojieba.NewJieba(
			dir+"/jieba.dict.utf8",
			dir+"/hmm_model.utf8",
			dir+"/user.dict.utf8",
			dir+"/idf.utf8",
			dir+"/stop_words.utf8",
		)
	}
	return &Jieba{jieba: jieba}
}
// AddWord passes each of the given words, in order, to the AddWord method of
// the wrapped gojieba instance. Calling it with no words does nothing.
func (j *Jieba) AddWord(words ...string) {
	for i := range words {
		j.jieba.AddWord(words[i])
	}
}
// SegWords extracts keywords from text using the wrapped gojieba instance
// and returns those that do not begin with whitespace, punctuation or a
// digit.
//
// The optional length sets how many keywords are requested from the
// extractor (only the first element is used); it defaults to 10. Because
// filtering happens after extraction, fewer than that many words may be
// returned. The result is nil when no keyword is kept.
func (j *Jieba) SegWords(text string, length ...int) (words []string) {
	topk := 10
	if len(length) > 0 {
		topk = length[0]
	}
	for _, wd := range j.jieba.Extract(text, topk) {
		// Inspect the first rune, not the first byte: rune(wd[0]) would
		// misclassify multi-byte characters such as CJK punctuation and
		// full-width digits, and would panic on an empty string.
		runes := []rune(wd)
		if len(runes) == 0 {
			continue
		}
		first := runes[0]
		if unicode.IsSpace(first) || unicode.IsPunct(first) || unicode.IsDigit(first) {
			continue
		}
		words = append(words, wd)
	}
	return words
}
Loading…
Cancel
Save