初步完成文档转换功能

2 years ago · eeeb9885be
parent f1c696666b
commit eeeb9885be
15 changed files with 609697 additions and 27 deletions
--- a/biz/document.go
+++ b/biz/document.go
@ -2,6 +2,7 @@ package biz
 import (
 	"context"
 	"strings"
 	pb "moredoc/api/v1"
 	"moredoc/middleware/auth"
@ -72,6 +73,7 @@ func (s *DocumentAPIService) CreateDocument(ctx context.Context, req *pb.CreateD
 	var (
 		documents           []model.Document
 		uuidAttachmentIdMap = make(map[string]int64)
 		jieba               = util.NewJieba()
 	)
 	for _, doc := range req.Document {
 		attachment, ok := attachmentMap[doc.AttachmentId]
@ -80,14 +82,15 @@ func (s *DocumentAPIService) CreateDocument(ctx context.Context, req *pb.CreateD
 		}
 		doc := model.Document{
-			Title:  doc.Title,
+			Title:    doc.Title,
-			UserId: userCliams.UserId,
+			Keywords: strings.Join(jieba.SegWords(doc.Title, 10), ","),
-			UUID:   uuid.Must(uuid.NewV4()).String(),
+			UserId:   userCliams.UserId,
-			Score:  300,
+			UUID:     uuid.Must(uuid.NewV4()).String(),
-			Price:  int(doc.Price),
+			Score:    300,
-			Size:   attachment.Size,
+			Price:    int(doc.Price),
-			Ext:    attachment.Ext,
+			Size:     attachment.Size,
-			Status: model.DocumentStatusPending,
+			Ext:      attachment.Ext,
 			Status:   model.DocumentStatusPending,
 		}
 		uuidAttachmentIdMap[doc.UUID] = attachment.Id
 		documents = append(documents, doc)
--- a/dict/hmm_model.utf8
+++ b/dict/hmm_model.utf8
--- a/dict/idf.utf8
+++ b/dict/idf.utf8
--- a/dict/jieba.dict.utf8
+++ b/dict/jieba.dict.utf8
--- a/dict/stop_words.utf8
+++ b/dict/stop_words.utf8
--- a/dict/user.dict.utf8
+++ b/dict/user.dict.utf8
@ -0,0 +1,4 @@
 云计算
 韩玉鉴赏
 蓝翔 nz
 区块链 10 nz
--- a/go.mod
+++ b/go.mod
@ -29,6 +29,7 @@ require (
 	github.com/goccy/go-json v0.9.7 // indirect
 	github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
 	github.com/pelletier/go-toml/v2 v2.0.1 // indirect
 	github.com/yanyiwu/gojieba v1.2.0 // indirect
 	golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 // indirect
 )
--- a/model/attachment.go
+++ b/model/attachment.go
@ -176,6 +176,14 @@ func (m *DBModel) DeleteAttachment(ids []int64) (err error) {
 	return
 }
 // GetAttachmentByTypeAndTypeId loads the first attachment whose type and
 // type_id columns equal typ and typeId. Query errors other than
 // gorm.ErrRecordNotFound are logged; no error is returned to the caller, so
 // callers detect a miss by inspecting the returned value (ConvertDocument
 // checks for Id == 0).
 func (m *DBModel) GetAttachmentByTypeAndTypeId(typ int, typeId int64) (attachment Attachment) {
 	query := m.db.Where("type = ? and type_id = ?", typ, typeId)
 	if queryErr := query.First(&attachment).Error; queryErr != nil && queryErr != gorm.ErrRecordNotFound {
 		m.logger.Error("GetAttachmentByTypeAndTypeId", zap.Error(queryErr))
 	}
 	return
 }
 func (m *DBModel) setAttachmentType(attachmentType int, attachmentTypeId int64, paths []string) {
 	var hashes []string
 	for _, path := range paths {
--- a/model/config.go
+++ b/model/config.go
@ -232,10 +232,10 @@ const (
 // ConfigConverter 转换配置
 type ConfigConverter struct {
-	MaxPreview int32 `json:"max_preview"` // 文档所允许的最大预览页数，0 表示不限制，全部转换
+	MaxPreview int  `json:"max_preview"` // 文档所允许的最大预览页数，0 表示不限制，全部转换
-	Timeout    int32 `json:"timeout"`     // 转换超时时间，单位为分钟，默认30分钟
+	Timeout    int  `json:"timeout"`     // 转换超时时间，单位为分钟，默认30分钟
-	EnableSVGO bool  `json:"enable_svgo"` // 是否对svg启用SVGO压缩。转换效率会有所下降。相对直接的svg文件，可以节省1/2的存储空间
+	EnableSVGO bool `json:"enable_svgo"` // 是否对svg启用SVGO压缩。转换效率会有所下降。相对直接的svg文件，可以节省1/2的存储空间
-	EnableGZIP bool  `json:"enable_gzip"` // 是否对svg启用GZIP压缩。转换效率会有所下降。相对直接的svg文件，可以节省3/4的存储空间
+	EnableGZIP bool `json:"enable_gzip"` // 是否对svg启用GZIP压缩。转换效率会有所下降。相对直接的svg文件，可以节省3/4的存储空间
 	// GZIP和svgo都开启，转换效率会有所下降，可以综合节省约85%的存储空间
 }
--- a/model/document.go
+++ b/model/document.go
@ -2,12 +2,23 @@ package model
 import (
 	"fmt"
 	"moredoc/util"
 	"moredoc/util/converter"
 	"os"
 	"path/filepath"
 	"strings"
 	"time"
 	"go.uber.org/zap"
 	"gorm.io/gorm"
 )
 const (
 	// Cover dimensions, following the aspect ratio of an A4 sheet.
 	DocumentCoverWidth  = 210
 	DocumentCoverHeight = 297
 )
 const (
 	DocumentStatusPending    = iota // 待转换
 	DocumentStatusConverting        // 转换中
@ -437,16 +448,19 @@ func (m *DBModel) CreateDocuments(documents []Document, categoryIds []int64) (do
 	return
 }
-// 根据文档hash，查询已转换了的文档状态
+// GetDocumentStatusConvertedByHash 根据文档hash，查询已转换了的文档状态
-func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map[string]int) {
+func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (hashMapDocuments map[string]Document) {
 	var (
 		tableDocument   = Document{}.TableName()
 		tableAttachment = Attachment{}.TableName()
 		attachMapIndex  = make(map[int64]int)
 		documentIds     []int64
 		docs            []Document
 	)
-	statusMap = make(map[string]int)
+	hashMapDocuments = make(map[string]Document)
 	sql := fmt.Sprintf(
-		"select a.hash from %s a left join %s d on a.type_id = d.id where a.hash in ? and d.status = ? group by a.hash",
+		"select a.hash,a.type_id from %s a left join %s d on a.type_id = d.id where a.hash in ? and d.status = ? group by a.hash",
 		tableAttachment, tableDocument,
 	)
@ -457,13 +471,19 @@ func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map
 		return
 	}
-	for _, attachment := range attachemnts {
+	for idx, attachment := range attachemnts {
-		statusMap[attachment.Hash] = DocumentStatusConverted
+		attachMapIndex[attachment.TypeId] = idx
 		documentIds = append(documentIds, attachment.TypeId)
 	}
 	m.db.Where("id in ?", documentIds).Find(&docs)
 	for _, doc := range docs {
 		hashMapDocuments[attachemnts[attachMapIndex[doc.Id]].Hash] = doc
 	}
 	return
 }
-// ConvertDocument 文档转换
+// ConvertDocument 文档转换。如果err返回gorm.ErrRecordNotFound，表示已没有文档需要转换
 // 1. 查询待转换的文档
 // 2. 文档对应的md5 hash中，是否有已转换的文档，如果有，则直接关联和调整状态为已转换
 // 3. 文档转PDF
@ -471,6 +491,125 @@ func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map
 // 5. 根据允许最大的预览页面，将PDF转为svg，同时转gzip压缩，如果有需要的话
 // 6. 提取PDF文本以及获取文档信息
 // 7. 更新文档状态
-func (m *DBModel) ConvertDocument() {
+func (m *DBModel) ConvertDocument() (err error) {
 	var document Document
 	err = m.db.Where("status = ?", DocumentStatusPending).First(&document).Error
 	if err != nil {
 		if err != gorm.ErrRecordNotFound {
 			m.logger.Error("ConvertDocument", zap.Error(err))
 		}
 		return
 	}
 	m.SetDocumentStatus(document.Id, DocumentStatusConverting)
 	attachment := m.GetAttachmentByTypeAndTypeId(AttachmentTypeDocument, document.Id)
 	if attachment.Id == 0 { // 附件不存在
 		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
 		if err != nil {
 			m.logger.Error("ConvertDocument", zap.Error(err))
 		}
 		return
 	}
 	// 文档hash
 	hashMapDocs := m.GetDocumentStatusConvertedByHash([]string{attachment.Hash})
 	if len(hashMapDocs) > 0 {
 		// 已有文档转换成功，将hash相同的文档相关数据迁移到当前文档
 		sql := " UPDATE `%s` SET `description`= ?, `cover` = ?, `width` = ?, `height`= ?, `preview`= ?, `pages` = ?, `status` = ? WHERE status in ? and id in (select type_id from `%s` where `hash` = ? and `type` = ?)"
 		sql = fmt.Sprintf(sql, Document{}.TableName(), Attachment{}.TableName())
 		for hash, doc := range hashMapDocs {
 			err = m.db.Exec(sql,
 				doc.Description, doc.Cover, doc.Width, doc.Height, doc.Preview, doc.Pages, DocumentStatusConverted, []int{DocumentStatusPending, DocumentStatusConverting, DocumentStatusFailed}, hash, AttachmentTypeDocument,
 			).Error
 			if err != nil {
 				m.logger.Error("ConvertDocument", zap.Error(err))
 				return
 			}
 		}
 		return
 	}
 	// 文档转为PDF
 	cfg := m.GetConfigOfConverter()
 	timeout := 30 * time.Minute
 	if cfg.Timeout > 0 {
 		timeout = time.Duration(cfg.Timeout) * time.Minute
 	}
 	localFile := strings.TrimLeft(attachment.Path, "./")
 	cvt := converter.NewConverter(m.logger, timeout)
 	dstPDF, err := cvt.ConvertToPDF(localFile)
 	if err != nil {
 		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
 		m.logger.Error("ConvertDocument", zap.Error(err))
 		return
 	}
 	defer os.Remove(dstPDF)
 	document.Pages, _ = cvt.CountPDFPages(dstPDF)
 	document.Preview = cfg.MaxPreview
 	// PDF截取第一章图片作为封面(封面不是最重要的，期间出现错误，不影响文档转换)
 	pages, err := cvt.ConvertPDFToPNG(dstPDF, 1, 1)
 	if err != nil {
 		m.logger.Error("get pdf cover", zap.Error(err))
 	}
 	var baseDir = strings.TrimSuffix(localFile, filepath.Ext(localFile))
 	if len(pages) > 0 {
 		coverBig := baseDir + "/cover.big.png"
 		cover := baseDir + "/cover.png"
 		util.CopyFile(pages[0].PagePath, coverBig)
 		util.CopyFile(pages[0].PagePath, cover)
 		util.CropImage(cover, DocumentCoverWidth, DocumentCoverHeight)
 		document.Width, document.Height, _ = util.GetImageSize(coverBig) // 页面宽高
 		document.Cover = "/" + cover
 	}
 	// PDF转为SVG
 	toPage := 100000
 	if cfg.MaxPreview > 0 {
 		toPage = cfg.MaxPreview
 	}
 	pages, err = cvt.ConvertPDFToSVG(dstPDF, 1, toPage, cfg.EnableSVGO, cfg.EnableGZIP)
 	if err != nil {
 		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
 		m.logger.Error("ConvertDocument", zap.Error(err))
 		return
 	}
 	for _, page := range pages {
 		util.CopyFile(page.PagePath, fmt.Sprintf(baseDir+"/%d%s", page.PageNum, filepath.Ext(page.PagePath)))
 		os.Remove(page.PagePath)
 	}
 	// 提取PDF文本以及获取文档信息
 	textFile, _ := cvt.ConvertPDFToTxt(dstPDF)
 	util.CopyFile(textFile, baseDir+"/content.txt")
 	// 读取文本内容，以提取关键字和摘要
 	if content, errRead := os.ReadFile(textFile); errRead == nil {
 		contentStr := string(content)
 		m.logger.Debug(textFile, zap.String("content", contentStr))
 		replacer := strings.NewReplacer(" ", "", "\r", " ", "\n", " ", "\t", " ")
 		document.Description = replacer.Replace(util.Substr(contentStr, 500))
 	}
 	os.Remove(textFile)
 	document.Status = DocumentStatusConverted
 	err = m.db.Select("description", "cover", "width", "height", "preview", "pages", "status").Where("id = ?", document.Id).Updates(document).Error
 	if err != nil {
 		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
 		m.logger.Error("ConvertDocument", zap.Error(err))
 	}
 	return
 }
 // SetDocumentStatus updates the status column of the document row with the
 // given id. A failed update is logged and the error is also returned.
 func (m *DBModel) SetDocumentStatus(documentId int64, status int) (err error) {
 	result := m.db.Model(&Document{}).Where("id = ?", documentId).Update("status", status)
 	if err = result.Error; err != nil {
 		m.logger.Error("SetDocumentStatus", zap.Error(err))
 	}
 	return err
 }
--- a/model/model_test.go
+++ b/model/model_test.go
@ -2,11 +2,13 @@ package model
 import (
 	"bytes"
 	"moredoc/conf"
 	"os"
 	"strings"
 	"testing"
 	"text/template"
 	"go.uber.org/zap"
 	"gorm.io/driver/mysql"
 	"gorm.io/gorm"
 )
@ -75,3 +77,22 @@ func TestGenData(t *testing.T) {
 	t.Log("生成 data.go 文件成功")
 }
 // TestConvertDocument runs a single ConvertDocument pass against the MySQL
 // database named in the DSN below; it fails on any returned error, including
 // the one reported when no pending document exists.
 func TestConvertDocument(t *testing.T) {
 	logger, _ := zap.NewDevelopment()
 	dbConf := &conf.Database{
 		DSN:     "root:root@tcp(127.0.0.1)/moredoc?charset=utf8mb4&parseTime=True&loc=Local",
 		Prefix:  "mnt_",
 		ShowSQL: true,
 	}
 	dbModel, err := NewDBModel(dbConf, logger)
 	if err != nil {
 		t.Fatal(err.Error())
 	}
 	if err = dbModel.ConvertDocument(); err != nil {
 		t.Fatal(err.Error())
 	}
 	t.Log("success")
 }
--- a/util/converter/converter.go
+++ b/util/converter/converter.go
@ -36,19 +36,25 @@ type Page struct {
 	PagePath string
 }
-func NewConverter(logger *zap.Logger, cachePath string, timeout ...time.Duration) *Converter {
+func NewConverter(logger *zap.Logger, timeout ...time.Duration) *Converter {
 	expire := 1 * time.Hour
 	if len(timeout) > 0 {
 		expire = timeout[0]
 	}
-	os.MkdirAll(cachePath, os.ModePerm)
+	defaultCachePath := "cache/convert"
 	os.MkdirAll(defaultCachePath, os.ModePerm)
 	return &Converter{
-		cachePath: cachePath,
+		cachePath: defaultCachePath,
 		timeout:   expire,
 		logger:    logger.Named("converter"),
 	}
 }
 // SetCachePath makes cachePath the directory used for conversion output,
 // creating it (and any missing parents) first. A failure to create the
 // directory is logged rather than silently dropped; the path is still
 // recorded so the behaviour seen by callers is unchanged.
 func (c *Converter) SetCachePath(cachePath string) {
 	if err := os.MkdirAll(cachePath, os.ModePerm); err != nil {
 		c.logger.Error("create cache path", zap.String("path", cachePath), zap.Error(err))
 	}
 	c.cachePath = cachePath
 }
 // ConvertToPDF 将文件转为PDF。
 // 自动根据文件类型调用相应的转换函数。
 func (c *Converter) ConvertToPDF(src string) (dst string, err error) {
@ -64,6 +70,8 @@ func (c *Converter) ConvertToPDF(src string) (dst string, err error) {
 		return c.ConvertMOBIToPDF(src)
 	case ".chm":
 		return c.ConvertCHMToPDF(src)
 	case ".pdf":
 		return c.PDFToPDF(src)
 	// case ".doc", ".docx", ".rtf", ".wps", ".odt",
 	// 	".xls", ".xlsx", ".et", ".ods",
 	// 	".ppt", ".pptx", ".dps", ".odp", ".pps", ".ppsx", ".pot", ".potx":
@ -112,6 +120,7 @@ func (c *Converter) ConvertPDFToTxt(src string) (dst string, err error) {
 	c.logger.Debug("convert pdf to txt", zap.String("cmd", mutool), zap.Strings("args", args))
 	_, err = util.ExecCommand(mutool, args, c.timeout)
 	if err != nil {
 		c.logger.Error("convert pdf to txt", zap.String("cmd", mutool), zap.Strings("args", args), zap.Error(err))
 		return
 	}
 	return dst, nil
@ -138,10 +147,11 @@ func (c *Converter) ConvertPDFToSVG(src string, fromPage, toPage int, enableSVGO
 	}
 	if enableGZIP { // gzip 压缩
-		for _, page := range pages {
+		for idx, page := range pages {
 			if dst, errCompress := c.CompressSVGByGZIP(page.PagePath); errCompress == nil {
 				os.Remove(page.PagePath)
 				page.PagePath = dst
 				pages[idx] = page
 			}
 		}
 	}
@ -153,10 +163,19 @@ func (c *Converter) ConvertPDFToPNG(src string, fromPage, toPage int) (pages []P
 	return c.convertPDFToPage(src, fromPage, toPage, ".png")
 }
 // PDFToPDF handles sources that are already PDF: the file is copied into
 // the dated sub-directory of the cache path under its base name, and the
 // slash-separated destination path is returned. A copy failure is logged
 // and returned together with the destination path.
 func (c *Converter) PDFToPDF(src string) (dst string, err error) {
 	dateDir := time.Now().Format(dirDteFmt)
 	target := filepath.Join(c.cachePath, dateDir, filepath.Base(src))
 	dst = strings.ReplaceAll(target, "\\", "/")
 	if err = util.CopyFile(src, dst); err != nil {
 		c.logger.Error("copy file error", zap.Error(err))
 	}
 	return dst, err
 }
 // ext 可选值： .png, .svg
 func (c *Converter) convertPDFToPage(src string, fromPage, toPage int, ext string) (pages []Page, err error) {
 	pageRange := fmt.Sprintf("%d-%d", fromPage, toPage)
-	cacheFile := strings.ReplaceAll(filepath.Join(c.cachePath, time.Now().Format(dirDteFmt), filepath.Base(src)+"/%d"+ext), "\\", "/")
+	cacheFile := strings.ReplaceAll(filepath.Join(c.cachePath, time.Now().Format(dirDteFmt), strings.TrimSuffix(filepath.Base(src), filepath.Ext(src))+"/%d"+ext), "\\", "/")
 	args := []string{
 		"convert",
 		"-o",
@ -177,7 +196,7 @@ func (c *Converter) convertPDFToPage(src string, fromPage, toPage int, ext strin
 			break
 		}
 		pages = append(pages, Page{
-			PageNum:  fromPage + i + 1,
+			PageNum:  fromPage + i,
 			PagePath: pagePath,
 		})
 	}
--- a/util/converter/converter_test.go
+++ b/util/converter/converter_test.go
@ -2,6 +2,7 @@ package converter
 import (
 	"os/exec"
 	"strings"
 	"testing"
 	"time"
@ -19,7 +20,8 @@ var (
 func init() {
 	logger, _ := zap.NewDevelopment()
-	converter = NewConverter(logger, "../../cache/convert")
+	converter = NewConverter(logger)
 	converter.SetCachePath("../../cache/convert")
 }
 func TestConvertToPDF(t *testing.T) {
@ -238,6 +240,8 @@ func TestCountPDFPages(t *testing.T) {
 }
 func TestExistCommand(t *testing.T) {
 	s := "我是中国人"
 	t.Log(strings.Count(s, "") - 1)
 	t.Logf("calibre= %v", converter.ExistCalibre())
 	t.Logf("svgo= %v", converter.ExistSVGO())
 	t.Logf("mupdf= %v", converter.ExistMupdf())
--- a/util/jieba.go
+++ b/util/jieba.go
@ -0,0 +1,56 @@
 package util
 import (
 	"unicode"
 	"github.com/yanyiwu/gojieba"
 )
 // jieba is the package-level gojieba instance; NewJieba assigns it the first
 // time it is called and hands the same instance to every wrapper afterwards.
 var jieba *gojieba.Jieba
 // Jieba is a thin wrapper around a gojieba instance.
 type Jieba struct {
 	jieba *gojieba.Jieba // underlying segmenter (the shared package-level instance when built by NewJieba)
 }
 // NewJieba returns a wrapper around the shared package-level gojieba
 // instance, creating that instance on the first call. The dictionaries are
 // loaded from dictDir[0] when given, otherwise from "dict". Because the
 // instance is created only once, dictDir has no effect on later calls.
 //
 // NOTE(review): the nil check and assignment are not synchronised, so
 // concurrent first calls could race — confirm whether callers need a guard.
 func NewJieba(dictDir ...string) *Jieba {
 	dir := "dict"
 	if len(dictDir) > 0 {
 		dir = dictDir[0]
 	}
 	if jieba == nil {
 		// Order matters: these are passed positionally to gojieba.NewJieba.
 		names := []string{
 			"jieba.dict.utf8",
 			"hmm_model.utf8",
 			"user.dict.utf8",
 			"idf.utf8",
 			"stop_words.utf8",
 		}
 		paths := make([]string, 0, len(names))
 		for _, name := range names {
 			paths = append(paths, dir+"/"+name)
 		}
 		jieba = gojieba.NewJieba(paths...)
 	}
 	return &Jieba{jieba: jieba}
 }
 // AddWord passes each of the given words, in order, to the underlying
 // gojieba instance's AddWord.
 func (j *Jieba) AddWord(words ...string) {
 	for i := range words {
 		j.jieba.AddWord(words[i])
 	}
 }
 // SegWords extracts keywords from text via gojieba's Extract and returns
 // those whose first character is not whitespace, punctuation or a digit.
 // length[0], when given, is the number of keywords requested from Extract
 // (default 10); fewer may be returned after filtering.
 func (j *Jieba) SegWords(text string, length ...int) (words []string) {
 	topk := 10
 	if len(length) > 0 {
 		topk = length[0]
 	}
 	for _, wd := range j.jieba.Extract(text, topk) {
 		// Classify the first rune, not the first byte: the lead byte of a
 		// multi-byte UTF-8 character is never space/punct/digit, so
 		// full-width punctuation and digits would slip through.
 		runes := []rune(wd)
 		if len(runes) == 0 {
 			continue // also guards against indexing an empty word
 		}
 		first := runes[0]
 		if unicode.IsSpace(first) || unicode.IsPunct(first) || unicode.IsDigit(first) {
 			continue
 		}
 		words = append(words, wd)
 	}
 	return
 }
--- a/util/util.go
+++ b/util/util.go
@ -68,6 +68,18 @@ func CropImage(file string, width, height int) (err error) {
 	return imaging.Save(img, file)
 }
 // GetImageSize opens the image at file and returns its width and height in
 // pixels. On failure it returns zero dimensions and the open error.
 func GetImageSize(file string) (width, height int, err error) {
 	var img image.Image
 	img, err = imaging.Open(file)
 	if err != nil {
 		return 0, 0, err
 	}
 	// Dx/Dy measure the extent of the bounds; Max.X/Max.Y are coordinates and
 	// only equal the size when the bounds start at the origin.
 	bounds := img.Bounds()
 	return bounds.Dx(), bounds.Dy(), nil
 }
 // LimitMin 数字最小值限制
 func LimitMin(number int, minValue int) int {
 	if number >= minValue {
@ -113,6 +125,12 @@ func CopyFile(src, dst string) error {
 	if err != nil {
 		return fmt.Errorf("couldn't open source file: %s", err)
 	}
 	dir := filepath.Dir(dst)
 	if _, e := os.Stat(dir); os.IsNotExist(e) {
 		os.MkdirAll(dir, os.ModePerm)
 	}
 	outputFile, err := os.Create(dst)
 	if err != nil {
 		inputFile.Close()
@ -126,3 +144,24 @@ func CopyFile(src, dst string) error {
 	}
 	return nil
 }
 // Substr returns at most length runes of str, starting at rune offset
 // start[0] (0 when omitted). Offsets and lengths count runes, not bytes, so
 // multi-byte characters are never split.
 //
 // Out-of-range arguments are clamped instead of panicking: a negative start
 // is treated as 0, a start at or past the end yields "", and a length that
 // is zero or negative yields "".
 func Substr(str string, length int, start ...int) string {
 	s := 0
 	if len(start) > 0 {
 		s = start[0]
 	}
 	if s < 0 {
 		s = 0
 	}
 	rs := []rune(str)
 	total := len(rs)
 	if s >= total || length <= 0 {
 		return ""
 	}
 	// Compare against the remaining rune count rather than computing
 	// s+length first, which could overflow for very large lengths.
 	end := total
 	if length < total-s {
 		end = s + length
 	}
 	return string(rs[s:end])
 }