优化分词字典

dev
truthhun 1 year ago
parent 41f7a89993
commit a8b40dd0c1

@ -14,7 +14,7 @@ import (
"moredoc/model"
"moredoc/util"
"moredoc/util/filetil"
"moredoc/util/gse"
"moredoc/util/segword/jieba"
"github.com/golang-jwt/jwt"
"go.uber.org/zap"
@ -89,7 +89,7 @@ func (s *DocumentAPIService) CreateDocument(ctx context.Context, req *pb.CreateD
doc := model.Document{
Title: doc.Title,
Keywords: strings.Join(gse.SegWords(doc.Title), ","),
Keywords: strings.Join(jieba.SegWords(doc.Title), ","),
UserId: userCliams.UserId,
// UUID: uuid.Must(uuid.NewV4()).String(),
Score: 300,

File diff suppressed because it is too large Load Diff

@ -7,7 +7,6 @@ require (
github.com/gin-contrib/cors v1.3.1
github.com/gin-contrib/gzip v0.0.5
github.com/gin-gonic/gin v1.8.1
github.com/go-ego/gse v0.80.2
github.com/gogo/protobuf v1.3.2
github.com/golang/protobuf v1.5.2
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
@ -31,7 +30,6 @@ require (
github.com/goccy/go-json v0.9.7 // indirect
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
github.com/pelletier/go-toml/v2 v2.0.1 // indirect
github.com/vcaesar/cedar v0.20.1 // indirect
golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 // indirect
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect
)
@ -45,7 +43,7 @@ require (
github.com/go-playground/locales v0.14.0
github.com/go-playground/universal-translator v0.18.0
github.com/go-playground/validator/v10 v10.10.1
github.com/go-sql-driver/mysql v1.6.0 // indirect
github.com/go-sql-driver/mysql v1.6.0
github.com/gofrs/uuid v4.3.0+incompatible
github.com/golang-jwt/jwt v3.2.2+incompatible
github.com/hashicorp/hcl v1.0.0 // indirect
@ -67,6 +65,7 @@ require (
github.com/spf13/pflag v1.0.5 // indirect
github.com/subosito/gotenv v1.2.0 // indirect
github.com/ugorji/go/codec v1.2.7 // indirect
github.com/wangbin/jiebago v0.3.2
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
golang.org/x/crypto v0.0.0-20211215153901-e495a2d5b3d3 // indirect

@ -133,8 +133,6 @@ github.com/gin-gonic/gin v1.6.3/go.mod h1:75u5sXoLsGZoRN5Sgbi1eraJ4GU3++wFwWzhwv
github.com/gin-gonic/gin v1.7.4/go.mod h1:jD2toBW3GZUr5UMcdrwQA10I7RuaFOl/SGeDjXkfUtY=
github.com/gin-gonic/gin v1.8.1 h1:4+fr/el88TOO3ewCmQr8cx/CtZ/umlIRIs5M4NTNjf8=
github.com/gin-gonic/gin v1.8.1/go.mod h1:ji8BvRH1azfM+SYow9zQ6SZMvR8qOMZHmsCuWR9tTTk=
github.com/go-ego/gse v0.80.2 h1:3LRfkaBuwlsHsmkOZvnhTcsYPXUAhiP06Sqcid7mO1M=
github.com/go-ego/gse v0.80.2/go.mod h1:kesekpZfcFQ/kwd9b27VZHUOH5dQUjaaQUZ4OGt4Hj4=
github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
@ -426,9 +424,8 @@ github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6
github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY=
github.com/ugorji/go/codec v1.2.7 h1:YPXUKf7fYbp/y8xloBqZOw2qaVggbfwMlI8WM3wZUJ0=
github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
github.com/vcaesar/cedar v0.20.1 h1:cDOmYWdprO7ZW8cngJrDi8Zivnscj9dA/y8Y+2SB1P0=
github.com/vcaesar/cedar v0.20.1/go.mod h1:iMDweyuW76RvSrCkQeZeQk4iCbshiPzcCvcGCtpM7iI=
github.com/vcaesar/tt v0.20.0 h1:9t2Ycb9RNHcP0WgQgIaRKJBB+FrRdejuaL6uWIHuoBA=
github.com/wangbin/jiebago v0.3.2 h1:reQKp0xTXWFK7eQ19L6Ofq5xODSR2hcam55qcdCCNpw=
github.com/wangbin/jiebago v0.3.2/go.mod h1:PAqQLauF0qAzy/63jBvO7Goh0oYBq1ocr0OXHLlujwQ=
github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=

@ -1,49 +0,0 @@
package gse
import (
"fmt"
"unicode"
"unicode/utf8"
"github.com/go-ego/gse"
"github.com/go-ego/gse/hmm/pos"
)
// Package-level segmentation state shared by loadDict and SegWords.
var (
	// seg is the segmenter that loadDict calls LoadDictEmbed and LoadStopEmbed
	// on, and that SegWords cuts text with.
	seg gse.Segmenter

	// posSeg is declared here but is not referenced by any function in this file.
	posSeg pos.Segmenter

	// loadedDict starts out false and is set to true by loadDict once both load
	// calls have succeeded; SegWords returns nothing while it is false.
	loadedDict bool
)
// init starts loadDict on its own goroutine so that package initialisation
// does not wait for the dictionaries to load.
func init() {
	go func() {
		loadDict()
	}()
}
// loadDict calls LoadDictEmbed and then LoadStopEmbed on the package-level
// segmenter. A failure in either step is printed to standard output and leaves
// loadedDict false; loadedDict is set to true only after both calls succeed.
func loadDict() {
	if err := seg.LoadDictEmbed(); err != nil {
		fmt.Println("seg.LoadDictEmbed", err)
		return
	}

	if err := seg.LoadStopEmbed(); err != nil {
		fmt.Println("seg.LoadStopEmbed", err)
		return
	}

	loadedDict = true
}
// SegWords cuts text with the package-level segmenter and returns the segments
// that are usable as keywords, in the order the segmenter produced them.
//
// It returns nil while loadedDict is still false. A segment is dropped when it
// is empty, consists of a single rune, or starts with a whitespace, punctuation
// or digit rune.
//
// NOTE(review): loadedDict is written by the goroutine started in init and read
// here without synchronisation; confirm whether that needs guarding.
func SegWords(text string) (words []string) {
	if !loadedDict {
		return nil
	}

	for _, wd := range seg.Cut(text) {
		// Skip empty and single-rune segments.
		if utf8.RuneCountInString(wd) <= 1 {
			continue
		}

		// Classify the first rune, not the first byte: the leading byte of a
		// multi-byte character (e.g. full-width punctuation) would otherwise be
		// misread as an unrelated Latin-1 letter and slip through the filter.
		first, _ := utf8.DecodeRuneInString(wd)
		if unicode.IsSpace(first) || unicode.IsPunct(first) || unicode.IsDigit(first) {
			continue
		}

		words = append(words, wd)
	}
	return words
}

@ -0,0 +1,67 @@
package jieba
import (
"strings"
"unicode"
"unicode/utf8"
"github.com/wangbin/jiebago"
)
// Package-level state for the jieba-based segmenter.
var (
// chineseChar is the set of single-character strings that isChineseChar
// tests membership in; the visible keys are CJK punctuation marks.
// NOTE(review): several keys below show up as empty strings in this view,
// presumably full-width punctuation lost in extraction. Duplicate constant
// keys do not compile in a Go map literal, so verify the literals against
// the original file.
chineseChar = map[string]bool{
"": true, "。": true, "": true, "": true, "、": true, "": true, "": true,
"": true, "": true, "《": true, "》": true, "“": true, "”": true, "": true,
"": true, "【": true, "】": true, "『": true, "』": true, "": true, "": true,
"〈": true, "〉": true, "﹑": true, "●": true, "…": true, "—": true, "": true,
}
// seg is the segmenter that loadDictionary loads the dictionary into and
// that SegWords cuts text with.
seg jiebago.Segmenter
// dictPath is the dictionary file passed to seg.LoadDictionary. It is a
// relative path, and loadDictionary overwrites it when given a non-empty
// argument.
dictPath = "dictionary/dict.txt"
// dictLoaded is set to true after a successful load so that later calls to
// loadDictionary return immediately.
dictLoaded = false
)
// init loads the segmentation dictionary during package initialisation and
// panics with the load error if that fails.
func init() {
	if err := loadDictionary(); err != nil {
		panic(err)
	}
}
// loadDictionary loads the dictionary file into the package-level segmenter.
//
// It does nothing and returns nil once a previous call has succeeded. When a
// first argument is supplied and is non-empty it replaces dictPath before
// loading; any further arguments are ignored. The error from
// seg.LoadDictionary is returned unchanged, and dictLoaded is set only on
// success.
func loadDictionary(dict ...string) error {
	if dictLoaded {
		return nil
	}

	if len(dict) > 0 && dict[0] != "" {
		dictPath = dict[0]
	}

	if err := seg.LoadDictionary(dictPath); err != nil {
		return err
	}

	dictLoaded = true
	return nil
}
// SegWords cuts text with the package-level segmenter's CutForSearch and
// returns the distinct terms usable as keywords, in the order they are yielded.
//
// Every segment is trimmed of surrounding whitespace and deduplicated. A term
// is dropped when it has fewer than two runes or when it consists solely of
// punctuation, whitespace or entries of chineseChar. The returned slice is
// never nil.
func SegWords(text string) []string {
	words := make([]string, 0)
	seen := make(map[string]struct{})

	for word := range seg.CutForSearch(text, true) {
		term := strings.TrimSpace(word)
		if _, dup := seen[term]; dup {
			continue
		}
		seen[term] = struct{}{}

		// Skip empty and single-rune terms, and terms that are nothing but
		// punctuation (e.g. "--" or "……"), which carry no keyword value.
		if utf8.RuneCountInString(term) < 2 || isPunctOnly(term) {
			continue
		}
		words = append(words, term)
	}
	return words
}

// isPunctOnly reports whether every rune of term is punctuation, whitespace or
// an entry of chineseChar. Runes are inspected individually so that multi-byte
// punctuation is classified correctly.
func isPunctOnly(term string) bool {
	for _, r := range term {
		if !unicode.IsPunct(r) && !unicode.IsSpace(r) && !isChineseChar(string(r)) {
			return false
		}
	}
	return true
}
// isChineseChar reports whether term is present as a key in chineseChar.
// Only key presence is checked; the stored value is not consulted.
func isChineseChar(term string) (found bool) {
	_, found = chineseChar[term]
	return found
}
Loading…
Cancel
Save