初步完成文档转换功能

1 year ago · eeeb9885be
parent f1c696666b
commit eeeb9885be
15 changed files with 609697 additions and 27 deletions
--- a/biz/document.go
+++ b/biz/document.go
@ -2,6 +2,7 @@ package biz

 import (
 	"context"
+	"strings"

 	pb "moredoc/api/v1"
 	"moredoc/middleware/auth"
@ -72,6 +73,7 @@ func (s *DocumentAPIService) CreateDocument(ctx context.Context, req *pb.CreateD
 	var (
 		documents           []model.Document
 		uuidAttachmentIdMap = make(map[string]int64)
+		jieba               = util.NewJieba()
 	)
 	for _, doc := range req.Document {
 		attachment, ok := attachmentMap[doc.AttachmentId]
@ -80,14 +82,15 @@ func (s *DocumentAPIService) CreateDocument(ctx context.Context, req *pb.CreateD
 		}

 		doc := model.Document{
-			Title:  doc.Title,
-			UserId: userCliams.UserId,
-			UUID:   uuid.Must(uuid.NewV4()).String(),
-			Score:  300,
-			Price:  int(doc.Price),
-			Size:   attachment.Size,
-			Ext:    attachment.Ext,
-			Status: model.DocumentStatusPending,
+			Title:    doc.Title,
+			Keywords: strings.Join(jieba.SegWords(doc.Title, 10), ","),
+			UserId:   userCliams.UserId,
+			UUID:     uuid.Must(uuid.NewV4()).String(),
+			Score:    300,
+			Price:    int(doc.Price),
+			Size:     attachment.Size,
+			Ext:      attachment.Ext,
+			Status:   model.DocumentStatusPending,
 		}
 		uuidAttachmentIdMap[doc.UUID] = attachment.Id
 		documents = append(documents, doc)
--- a/dict/hmm_model.utf8
+++ b/dict/hmm_model.utf8
--- a/dict/idf.utf8
+++ b/dict/idf.utf8
--- a/dict/jieba.dict.utf8
+++ b/dict/jieba.dict.utf8
--- a/dict/stop_words.utf8
+++ b/dict/stop_words.utf8
--- a/dict/user.dict.utf8
+++ b/dict/user.dict.utf8
@ -0,0 +1,4 @@
+云计算
+韩玉鉴赏
+蓝翔 nz
+区块链 10 nz
--- a/go.mod
+++ b/go.mod
@ -29,6 +29,7 @@ require (
 	github.com/goccy/go-json v0.9.7 // indirect
 	github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
 	github.com/pelletier/go-toml/v2 v2.0.1 // indirect
+	github.com/yanyiwu/gojieba v1.2.0 // indirect
 	golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 // indirect
 )

--- a/model/attachment.go
+++ b/model/attachment.go
@ -176,6 +176,14 @@ func (m *DBModel) DeleteAttachment(ids []int64) (err error) {
 	return
 }

+// GetAttachmentByTypeAndTypeId loads the first attachment whose type and
+// type_id columns equal typ and typeId. The attachment is returned as the
+// query left it (its zero value when nothing was loaded). Query errors other
+// than gorm.ErrRecordNotFound are logged; no error is returned to the caller.
+func (m *DBModel) GetAttachmentByTypeAndTypeId(typ int, typeId int64) (attachment Attachment) {
+	query := m.db.Where("type = ? and type_id = ?", typ, typeId)
+	switch err := query.First(&attachment).Error; err {
+	case nil, gorm.ErrRecordNotFound:
+		// Nothing to report: a row was loaded, or none exists.
+	default:
+		m.logger.Error("GetAttachmentByTypeAndTypeId", zap.Error(err))
+	}
+	return
+}
+
 func (m *DBModel) setAttachmentType(attachmentType int, attachmentTypeId int64, paths []string) {
 	var hashes []string
 	for _, path := range paths {
--- a/model/config.go
+++ b/model/config.go
@ -232,10 +232,10 @@ const (

 // ConfigConverter 转换配置
 type ConfigConverter struct {
-	MaxPreview int32 `json:"max_preview"` // 文档所允许的最大预览页数，0 表示不限制，全部转换
-	Timeout    int32 `json:"timeout"`     // 转换超时时间，单位为分钟，默认30分钟
-	EnableSVGO bool  `json:"enable_svgo"` // 是否对svg启用SVGO压缩。转换效率会有所下降。相对直接的svg文件，可以节省1/2的存储空间
-	EnableGZIP bool  `json:"enable_gzip"` // 是否对svg启用GZIP压缩。转换效率会有所下降。相对直接的svg文件，可以节省3/4的存储空间
+	MaxPreview int  `json:"max_preview"` // 文档所允许的最大预览页数，0 表示不限制，全部转换
+	Timeout    int  `json:"timeout"`     // 转换超时时间，单位为分钟，默认30分钟
+	EnableSVGO bool `json:"enable_svgo"` // 是否对svg启用SVGO压缩。转换效率会有所下降。相对直接的svg文件，可以节省1/2的存储空间
+	EnableGZIP bool `json:"enable_gzip"` // 是否对svg启用GZIP压缩。转换效率会有所下降。相对直接的svg文件，可以节省3/4的存储空间
 	// GZIP和svgo都开启，转换效率会有所下降，可以综合节省约85%的存储空间
 }

--- a/model/document.go
+++ b/model/document.go
@ -2,12 +2,23 @@ package model

 import (
 	"fmt"
+	"moredoc/util"
+	"moredoc/util/converter"
+	"os"
+	"path/filepath"
+	"strings"
 	"time"

 	"go.uber.org/zap"
 	"gorm.io/gorm"
 )

+const (
+	// Cover dimensions, following the aspect ratio of an A4 sheet.
+	DocumentCoverWidth  = 210
+	DocumentCoverHeight = 297
+)
+
 const (
 	DocumentStatusPending    = iota // 待转换
 	DocumentStatusConverting        // 转换中
@ -437,16 +448,19 @@ func (m *DBModel) CreateDocuments(documents []Document, categoryIds []int64) (do
 	return
 }

-// 根据文档hash，查询已转换了的文档状态
-func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map[string]int) {
+// GetDocumentStatusConvertedByHash 根据文档hash，查询已转换了的文档状态
+func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (hashMapDocuments map[string]Document) {
 	var (
 		tableDocument   = Document{}.TableName()
 		tableAttachment = Attachment{}.TableName()
+		attachMapIndex  = make(map[int64]int)
+		documentIds     []int64
+		docs            []Document
 	)

-	statusMap = make(map[string]int)
+	hashMapDocuments = make(map[string]Document)
 	sql := fmt.Sprintf(
-		"select a.hash from %s a left join %s d on a.type_id = d.id where a.hash in ? and d.status = ? group by a.hash",
+		"select a.hash,a.type_id from %s a left join %s d on a.type_id = d.id where a.hash in ? and d.status = ? group by a.hash",
 		tableAttachment, tableDocument,
 	)

@ -457,13 +471,19 @@ func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map
 		return
 	}

-	for _, attachment := range attachemnts {
-		statusMap[attachment.Hash] = DocumentStatusConverted
+	for idx, attachment := range attachemnts {
+		attachMapIndex[attachment.TypeId] = idx
+		documentIds = append(documentIds, attachment.TypeId)
+	}
+
+	m.db.Where("id in ?", documentIds).Find(&docs)
+	for _, doc := range docs {
+		hashMapDocuments[attachemnts[attachMapIndex[doc.Id]].Hash] = doc
 	}
 	return
 }

-// ConvertDocument 文档转换
+// ConvertDocument 文档转换。如果err返回gorm.ErrRecordNotFound，表示已没有文档需要转换
 // 1. 查询待转换的文档
 // 2. 文档对应的md5 hash中，是否有已转换的文档，如果有，则直接关联和调整状态为已转换
 // 3. 文档转PDF
@ -471,6 +491,125 @@ func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map
 // 5. 根据允许最大的预览页面，将PDF转为svg，同时转gzip压缩，如果有需要的话
 // 6. 提取PDF文本以及获取文档信息
 // 7. 更新文档状态
-func (m *DBModel) ConvertDocument() {
+//
+// In short: ConvertDocument takes the first document whose status is pending
+// and converts it. It returns gorm.ErrRecordNotFound when no pending document
+// exists, a non-nil error when the conversion of the selected document failed
+// (the document is then marked as failed), and nil on success.
+func (m *DBModel) ConvertDocument() (err error) {
+	var document Document
+	err = m.db.Where("status = ?", DocumentStatusPending).First(&document).Error
+	if err != nil {
+		// ErrRecordNotFound only means there is nothing left to convert, so it
+		// is handed back to the caller without being logged.
+		if err != gorm.ErrRecordNotFound {
+			m.logger.Error("ConvertDocument", zap.Error(err))
+		}
+		return
+	}
+
+	m.SetDocumentStatus(document.Id, DocumentStatusConverting)

+	attachment := m.GetAttachmentByTypeAndTypeId(AttachmentTypeDocument, document.Id)
+	if attachment.Id == 0 { // the attachment does not exist
+		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
+		// err is always nil at this point, so build the error explicitly;
+		// otherwise the failure is neither logged nor reported.
+		err = fmt.Errorf("attachment of document %d not found", document.Id)
+		m.logger.Error("ConvertDocument", zap.Error(err))
+		return
+	}
+
+	// Look for already converted documents that share this file hash.
+	hashMapDocs := m.GetDocumentStatusConvertedByHash([]string{attachment.Hash})
+	if len(hashMapDocs) > 0 {
+		// A document with the same hash was converted before: copy its
+		// conversion result onto every not-yet-converted document that uses
+		// the same file instead of converting again.
+		sql := " UPDATE `%s` SET `description`= ?, `cover` = ?, `width` = ?, `height`= ?, `preview`= ?, `pages` = ?, `status` = ? WHERE status in ? and id in (select type_id from `%s` where `hash` = ? and `type` = ?)"
+		sql = fmt.Sprintf(sql, Document{}.TableName(), Attachment{}.TableName())
+		for hash, doc := range hashMapDocs {
+			err = m.db.Exec(sql,
+				doc.Description, doc.Cover, doc.Width, doc.Height, doc.Preview, doc.Pages, DocumentStatusConverted, []int{DocumentStatusPending, DocumentStatusConverting, DocumentStatusFailed}, hash, AttachmentTypeDocument,
+			).Error
+			if err != nil {
+				m.logger.Error("ConvertDocument", zap.Error(err))
+				return
+			}
+		}
+		return
+	}
+
+	// Convert the document to PDF. The configured timeout is in minutes and
+	// falls back to 30 minutes when it is not positive.
+	cfg := m.GetConfigOfConverter()
+	timeout := 30 * time.Minute
+	if cfg.Timeout > 0 {
+		timeout = time.Duration(cfg.Timeout) * time.Minute
+	}
+
+	// TrimLeft strips every leading '.' and '/' character, turning the stored
+	// path into one relative to the working directory.
+	localFile := strings.TrimLeft(attachment.Path, "./")
+
+	cvt := converter.NewConverter(m.logger, timeout)
+	dstPDF, err := cvt.ConvertToPDF(localFile)
+	if err != nil {
+		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
+		m.logger.Error("ConvertDocument", zap.Error(err))
+		return
+	}
+	defer os.Remove(dstPDF)
+	// The page count is informational; a counting error leaves it as returned.
+	document.Pages, _ = cvt.CountPDFPages(dstPDF)
+	document.Preview = cfg.MaxPreview
+
+	// Render the first page of the PDF as the cover. The cover is not
+	// essential, so a failure here is logged and does not abort the conversion
+	// (and must not leak into the returned error).
+	pages, errCover := cvt.ConvertPDFToPNG(dstPDF, 1, 1)
+	if errCover != nil {
+		m.logger.Error("get pdf cover", zap.Error(errCover))
+	}
+
+	// All generated files live in a directory named after the source file
+	// without its extension.
+	var baseDir = strings.TrimSuffix(localFile, filepath.Ext(localFile))
+	if len(pages) > 0 {
+		coverBig := baseDir + "/cover.big.png"
+		cover := baseDir + "/cover.png"
+		util.CopyFile(pages[0].PagePath, coverBig)
+		util.CopyFile(pages[0].PagePath, cover)
+		util.CropImage(cover, DocumentCoverWidth, DocumentCoverHeight)
+		document.Width, document.Height, _ = util.GetImageSize(coverBig) // page width and height
+		document.Cover = "/" + cover
+	}
+
+	// Convert the PDF to SVG pages. Without a preview limit a very large upper
+	// bound is used so that every page is converted.
+	toPage := 100000
+	if cfg.MaxPreview > 0 {
+		toPage = cfg.MaxPreview
+	}
+	pages, err = cvt.ConvertPDFToSVG(dstPDF, 1, toPage, cfg.EnableSVGO, cfg.EnableGZIP)
+	if err != nil {
+		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
+		m.logger.Error("ConvertDocument", zap.Error(err))
+		return
+	}
+
+	// Move every page out of the converter cache. The temporary file is always
+	// removed; the first copy failure is remembered so the document is not
+	// reported as converted with pages missing.
+	for _, page := range pages {
+		// baseDir is passed as an argument, never as part of the format
+		// string, so a '%' in the path cannot corrupt the result.
+		dst := fmt.Sprintf("%s/%d%s", baseDir, page.PageNum, filepath.Ext(page.PagePath))
+		if errCopy := util.CopyFile(page.PagePath, dst); errCopy != nil && err == nil {
+			err = errCopy
+		}
+		os.Remove(page.PagePath)
+	}
+	if err != nil {
+		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
+		m.logger.Error("ConvertDocument", zap.Error(err))
+		return
+	}
+
+	// Extract the PDF text. This step is best effort: when it fails the
+	// document simply keeps an empty description.
+	textFile, errTxt := cvt.ConvertPDFToTxt(dstPDF)
+	if errTxt == nil {
+		util.CopyFile(textFile, baseDir+"/content.txt")
+
+		// Use the first 500 characters of the text as the description:
+		// spaces are dropped, other whitespace becomes a single space.
+		if content, errRead := os.ReadFile(textFile); errRead == nil {
+			contentStr := string(content)
+			m.logger.Debug(textFile, zap.String("content", contentStr))
+			replacer := strings.NewReplacer(" ", "", "\r", " ", "\n", " ", "\t", " ")
+			document.Description = replacer.Replace(util.Substr(contentStr, 500))
+		}
+	}
+	if textFile != "" {
+		os.Remove(textFile)
+	}
+
+	// Persist the conversion result; only the listed columns are written.
+	document.Status = DocumentStatusConverted
+	err = m.db.Select("description", "cover", "width", "height", "preview", "pages", "status").Where("id = ?", document.Id).Updates(document).Error
+	if err != nil {
+		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
+		m.logger.Error("ConvertDocument", zap.Error(err))
+	}
+	return
+}
+
+// SetDocumentStatus writes status into the status column of the document row
+// whose id equals documentId. A failed update is logged and also returned.
+func (m *DBModel) SetDocumentStatus(documentId int64, status int) (err error) {
+	result := m.db.Model(&Document{}).Where("id = ?", documentId).Update("status", status)
+	if err = result.Error; err != nil {
+		m.logger.Error("SetDocumentStatus", zap.Error(err))
+	}
+	return
 }
--- a/model/model_test.go
+++ b/model/model_test.go
@ -2,11 +2,13 @@ package model

 import (
 	"bytes"
+	"moredoc/conf"
 	"os"
 	"strings"
 	"testing"
 	"text/template"

+	"go.uber.org/zap"
 	"gorm.io/driver/mysql"
 	"gorm.io/gorm"
 )
@ -75,3 +77,22 @@ func TestGenData(t *testing.T) {

 	t.Log("生成 data.go 文件成功")
 }
+
+// TestConvertDocument runs ConvertDocument once against the MySQL database
+// addressed by the hard-coded DSN below and fails on any returned error.
+func TestConvertDocument(t *testing.T) {
+	logger, _ := zap.NewDevelopment()
+	cfg := &conf.Database{
+		DSN:     "root:root@tcp(127.0.0.1)/moredoc?charset=utf8mb4&parseTime=True&loc=Local",
+		Prefix:  "mnt_",
+		ShowSQL: true,
+	}
+
+	dbModel, err := NewDBModel(cfg, logger)
+	if err != nil {
+		t.Fatal(err.Error())
+	}
+
+	if err = dbModel.ConvertDocument(); err != nil {
+		t.Fatal(err.Error())
+	}
+
+	t.Log("success")
+}
--- a/util/converter/converter.go
+++ b/util/converter/converter.go
@ -36,19 +36,25 @@ type Page struct {
 	PagePath string
 }

-func NewConverter(logger *zap.Logger, cachePath string, timeout ...time.Duration) *Converter {
+func NewConverter(logger *zap.Logger, timeout ...time.Duration) *Converter {
 	expire := 1 * time.Hour
 	if len(timeout) > 0 {
 		expire = timeout[0]
 	}
-	os.MkdirAll(cachePath, os.ModePerm)
+	defaultCachePath := "cache/convert"
+	os.MkdirAll(defaultCachePath, os.ModePerm)
 	return &Converter{
-		cachePath: cachePath,
+		cachePath: defaultCachePath,
 		timeout:   expire,
 		logger:    logger.Named("converter"),
 	}
 }

+// SetCachePath makes cachePath the converter's cache directory and creates it
+// (including parents) on a best-effort basis: the os.MkdirAll error is
+// discarded.
+func (c *Converter) SetCachePath(cachePath string) {
+	c.cachePath = cachePath
+	_ = os.MkdirAll(c.cachePath, os.ModePerm)
+}
+
 // ConvertToPDF 将文件转为PDF。
 // 自动根据文件类型调用相应的转换函数。
 func (c *Converter) ConvertToPDF(src string) (dst string, err error) {
@ -64,6 +70,8 @@ func (c *Converter) ConvertToPDF(src string) (dst string, err error) {
 		return c.ConvertMOBIToPDF(src)
 	case ".chm":
 		return c.ConvertCHMToPDF(src)
+	case ".pdf":
+		return c.PDFToPDF(src)
 	// case ".doc", ".docx", ".rtf", ".wps", ".odt",
 	// 	".xls", ".xlsx", ".et", ".ods",
 	// 	".ppt", ".pptx", ".dps", ".odp", ".pps", ".ppsx", ".pot", ".potx":
@ -112,6 +120,7 @@ func (c *Converter) ConvertPDFToTxt(src string) (dst string, err error) {
 	c.logger.Debug("convert pdf to txt", zap.String("cmd", mutool), zap.Strings("args", args))
 	_, err = util.ExecCommand(mutool, args, c.timeout)
 	if err != nil {
+		c.logger.Error("convert pdf to txt", zap.String("cmd", mutool), zap.Strings("args", args), zap.Error(err))
 		return
 	}
 	return dst, nil
@ -138,10 +147,11 @@ func (c *Converter) ConvertPDFToSVG(src string, fromPage, toPage int, enableSVGO
 	}

 	if enableGZIP { // gzip 压缩
-		for _, page := range pages {
+		for idx, page := range pages {
 			if dst, errCompress := c.CompressSVGByGZIP(page.PagePath); errCompress == nil {
 				os.Remove(page.PagePath)
 				page.PagePath = dst
+				pages[idx] = page
 			}
 		}
 	}
@ -153,10 +163,19 @@ func (c *Converter) ConvertPDFToPNG(src string, fromPage, toPage int) (pages []P
 	return c.convertPDFToPage(src, fromPage, toPage, ".png")
 }

+// PDFToPDF copies the PDF at src into the converter cache and returns the
+// path of the copy. The destination is cachePath, then a directory named from
+// the current time formatted with dirDteFmt, then the base name of src, with
+// back-slashes replaced by forward slashes. A copy failure is logged and
+// returned together with the computed destination path.
+func (c *Converter) PDFToPDF(src string) (dst string, err error) {
+	timeDir := time.Now().Format(dirDteFmt)
+	dst = filepath.Join(c.cachePath, timeDir, filepath.Base(src))
+	dst = strings.ReplaceAll(dst, "\\", "/")
+	if err = util.CopyFile(src, dst); err != nil {
+		c.logger.Error("copy file error", zap.Error(err))
+	}
+	return
+}
+
 // ext 可选值： .png, .svg
 func (c *Converter) convertPDFToPage(src string, fromPage, toPage int, ext string) (pages []Page, err error) {
 	pageRange := fmt.Sprintf("%d-%d", fromPage, toPage)
-	cacheFile := strings.ReplaceAll(filepath.Join(c.cachePath, time.Now().Format(dirDteFmt), filepath.Base(src)+"/%d"+ext), "\\", "/")
+	cacheFile := strings.ReplaceAll(filepath.Join(c.cachePath, time.Now().Format(dirDteFmt), strings.TrimSuffix(filepath.Base(src), filepath.Ext(src))+"/%d"+ext), "\\", "/")
 	args := []string{
 		"convert",
 		"-o",
@ -177,7 +196,7 @@ func (c *Converter) convertPDFToPage(src string, fromPage, toPage int, ext strin
 			break
 		}
 		pages = append(pages, Page{
-			PageNum:  fromPage + i + 1,
+			PageNum:  fromPage + i,
 			PagePath: pagePath,
 		})
 	}
--- a/util/converter/converter_test.go
+++ b/util/converter/converter_test.go
@ -2,6 +2,7 @@ package converter

 import (
 	"os/exec"
+	"strings"
 	"testing"
 	"time"

@ -19,7 +20,8 @@ var (

 func init() {
 	logger, _ := zap.NewDevelopment()
-	converter = NewConverter(logger, "../../cache/convert")
+	converter = NewConverter(logger)
+	converter.SetCachePath("../../cache/convert")
 }

 func TestConvertToPDF(t *testing.T) {
@ -238,6 +240,8 @@ func TestCountPDFPages(t *testing.T) {
 }

 func TestExistCommand(t *testing.T) {
+	s := "我是中国人"
+	t.Log(strings.Count(s, "") - 1)
 	t.Logf("calibre= %v", converter.ExistCalibre())
 	t.Logf("svgo= %v", converter.ExistSVGO())
 	t.Logf("mupdf= %v", converter.ExistMupdf())
--- a/util/jieba.go
+++ b/util/jieba.go
@ -0,0 +1,56 @@
+package util
+
+import (
+	"unicode"
+
+	"github.com/yanyiwu/gojieba"
+)
+
+// jieba is the package-wide gojieba instance. NewJieba creates it the first
+// time it is needed and hands the same instance to every Jieba wrapper.
+var jieba *gojieba.Jieba
+
+// Jieba is a thin wrapper around the shared gojieba instance.
+type Jieba struct {
+	jieba *gojieba.Jieba
+}
+
+// NewJieba returns a wrapper around the package-level gojieba instance,
+// creating that instance on the first call.
+//
+// Dictionaries are loaded from dictDir[0] when it is given, otherwise from
+// "dict". The instance is only built while the package variable is still nil,
+// so a dictDir passed on a later call has no effect.
+//
+// NOTE(review): the nil check is not synchronized — confirm that callers
+// cannot race on the first call.
+func NewJieba(dictDir ...string) *Jieba {
+	dir := "dict"
+	if len(dictDir) > 0 {
+		dir = dictDir[0]
+	}
+
+	if jieba == nil {
+		// The order of the dictionary files is the order gojieba receives them in.
+		names := [...]string{
+			"jieba.dict.utf8",
+			"hmm_model.utf8",
+			"user.dict.utf8",
+			"idf.utf8",
+			"stop_words.utf8",
+		}
+		paths := make([]string, 0, len(names))
+		for _, name := range names {
+			paths = append(paths, dir+"/"+name)
+		}
+		jieba = gojieba.NewJieba(paths...)
+	}
+	return &Jieba{jieba: jieba}
+}
+
+// AddWord passes each of the given words to AddWord of the wrapped gojieba
+// instance, in the order they were supplied.
+func (j *Jieba) AddWord(words ...string) {
+	for i := range words {
+		j.jieba.AddWord(words[i])
+	}
+}
+
+// SegWords extracts keywords from text with the wrapped gojieba instance and
+// returns those that do not start with whitespace, punctuation or a digit.
+//
+// length[0], when given, is the number of keywords requested from gojieba
+// (10 by default); the result can be shorter because of the filtering.
+func (j *Jieba) SegWords(text string, length ...int) (words []string) {
+	topk := 10
+	if len(length) > 0 {
+		topk = length[0]
+	}
+	for _, wd := range j.jieba.Extract(text, topk) {
+		// Classify the first rune, not the first byte: for multi-byte UTF-8
+		// text (e.g. full-width punctuation such as "，" or "。") the first
+		// byte alone decodes to an unrelated Latin-1 letter and the word
+		// would never be filtered. Empty words are skipped as well.
+		runes := []rune(wd)
+		if len(runes) == 0 {
+			continue
+		}
+		first := runes[0]
+		if unicode.IsSpace(first) || unicode.IsPunct(first) || unicode.IsDigit(first) {
+			continue
+		}
+		words = append(words, wd)
+	}
+	return
+}
--- a/util/util.go
+++ b/util/util.go
@ -68,6 +68,18 @@ func CropImage(file string, width, height int) (err error) {
 	return imaging.Save(img, file)
 }

+// GetImageSize 获取图片宽高尺寸信息
+// GetImageSize opens the image stored in file and returns its width and
+// height. When the image cannot be opened, the error is returned and both
+// dimensions are 0.
+func GetImageSize(file string) (width, height int, err error) {
+	var img image.Image
+	img, err = imaging.Open(file)
+	if err != nil {
+		return
+	}
+	// Dx/Dy measure the extent of the bounds; Max.X/Max.Y alone are only the
+	// size when the rectangle starts at the origin.
+	bounds := img.Bounds()
+	width = bounds.Dx()
+	height = bounds.Dy()
+	return
+}
+
 // LimitMin 数字最小值限制
 func LimitMin(number int, minValue int) int {
 	if number >= minValue {
@ -113,6 +125,12 @@ func CopyFile(src, dst string) error {
 	if err != nil {
 		return fmt.Errorf("couldn't open source file: %s", err)
 	}
+
+	dir := filepath.Dir(dst)
+	if _, e := os.Stat(dir); os.IsNotExist(e) {
+		os.MkdirAll(dir, os.ModePerm)
+	}
+
 	outputFile, err := os.Create(dst)
 	if err != nil {
 		inputFile.Close()
@ -126,3 +144,24 @@ func CopyFile(src, dst string) error {
 	}
 	return nil
 }
+
+// Substr returns at most length characters (runes, not bytes) of str,
+// starting at the rune offset start[0], or at 0 when start is omitted.
+//
+// Out-of-range arguments are clamped instead of panicking: a negative start
+// is treated as 0, a start beyond the end yields "", and a non-positive
+// length yields "".
+func Substr(str string, length int, start ...int) string {
+	s := 0
+	if len(start) > 0 {
+		s = start[0]
+	}
+
+	rs := []rune(str)
+	lth := len(rs)
+
+	if s < 0 {
+		s = 0
+	}
+	if s > lth {
+		s = lth
+	}
+	if length <= 0 {
+		return ""
+	}
+
+	// Compare against the remaining rune count rather than computing
+	// s+length first, so a huge length cannot overflow.
+	end := lth
+	if length < lth-s {
+		end = s + length
+	}
+
+	return string(rs[s:end])
+}