初步完成文档转换功能

dev
truthhun 2 years ago
parent f1c696666b
commit eeeb9885be

@ -2,6 +2,7 @@ package biz
import ( import (
"context" "context"
"strings"
pb "moredoc/api/v1" pb "moredoc/api/v1"
"moredoc/middleware/auth" "moredoc/middleware/auth"
@ -72,6 +73,7 @@ func (s *DocumentAPIService) CreateDocument(ctx context.Context, req *pb.CreateD
var ( var (
documents []model.Document documents []model.Document
uuidAttachmentIdMap = make(map[string]int64) uuidAttachmentIdMap = make(map[string]int64)
jieba = util.NewJieba()
) )
for _, doc := range req.Document { for _, doc := range req.Document {
attachment, ok := attachmentMap[doc.AttachmentId] attachment, ok := attachmentMap[doc.AttachmentId]
@ -80,14 +82,15 @@ func (s *DocumentAPIService) CreateDocument(ctx context.Context, req *pb.CreateD
} }
doc := model.Document{ doc := model.Document{
Title: doc.Title, Title: doc.Title,
UserId: userCliams.UserId, Keywords: strings.Join(jieba.SegWords(doc.Title, 10), ","),
UUID: uuid.Must(uuid.NewV4()).String(), UserId: userCliams.UserId,
Score: 300, UUID: uuid.Must(uuid.NewV4()).String(),
Price: int(doc.Price), Score: 300,
Size: attachment.Size, Price: int(doc.Price),
Ext: attachment.Ext, Size: attachment.Size,
Status: model.DocumentStatusPending, Ext: attachment.Ext,
Status: model.DocumentStatusPending,
} }
uuidAttachmentIdMap[doc.UUID] = attachment.Id uuidAttachmentIdMap[doc.UUID] = attachment.Id
documents = append(documents, doc) documents = append(documents, doc)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,4 @@
云计算
韩玉鉴赏
蓝翔 nz
区块链 10 nz

@ -29,6 +29,7 @@ require (
github.com/goccy/go-json v0.9.7 // indirect github.com/goccy/go-json v0.9.7 // indirect
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
github.com/pelletier/go-toml/v2 v2.0.1 // indirect github.com/pelletier/go-toml/v2 v2.0.1 // indirect
github.com/yanyiwu/gojieba v1.2.0 // indirect
golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 // indirect golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 // indirect
) )

@ -176,6 +176,14 @@ func (m *DBModel) DeleteAttachment(ids []int64) (err error) {
return return
} }
// GetAttachmentByTypeAndTypeId returns the first attachment matching the given
// type and type_id. When no row matches, or the query fails, the zero-value
// Attachment is returned; query failures other than "record not found" are logged.
func (m *DBModel) GetAttachmentByTypeAndTypeId(typ int, typeId int64) (attachment Attachment) {
	query := m.db.Where("type = ? and type_id = ?", typ, typeId)
	queryErr := query.First(&attachment).Error
	if queryErr == nil || queryErr == gorm.ErrRecordNotFound {
		return attachment
	}
	m.logger.Error("GetAttachmentByTypeAndTypeId", zap.Error(queryErr))
	return attachment
}
func (m *DBModel) setAttachmentType(attachmentType int, attachmentTypeId int64, paths []string) { func (m *DBModel) setAttachmentType(attachmentType int, attachmentTypeId int64, paths []string) {
var hashes []string var hashes []string
for _, path := range paths { for _, path := range paths {

@ -232,10 +232,10 @@ const (
// ConfigConverter 转换配置 // ConfigConverter 转换配置
type ConfigConverter struct { type ConfigConverter struct {
MaxPreview int32 `json:"max_preview"` // 文档所允许的最大预览页数0 表示不限制,全部转换 MaxPreview int `json:"max_preview"` // 文档所允许的最大预览页数0 表示不限制,全部转换
Timeout int32 `json:"timeout"` // 转换超时时间单位为分钟默认30分钟 Timeout int `json:"timeout"` // 转换超时时间单位为分钟默认30分钟
EnableSVGO bool `json:"enable_svgo"` // 是否对svg启用SVGO压缩。转换效率会有所下降。相对直接的svg文件可以节省1/2的存储空间 EnableSVGO bool `json:"enable_svgo"` // 是否对svg启用SVGO压缩。转换效率会有所下降。相对直接的svg文件可以节省1/2的存储空间
EnableGZIP bool `json:"enable_gzip"` // 是否对svg启用GZIP压缩。转换效率会有所下降。相对直接的svg文件可以节省3/4的存储空间 EnableGZIP bool `json:"enable_gzip"` // 是否对svg启用GZIP压缩。转换效率会有所下降。相对直接的svg文件可以节省3/4的存储空间
// GZIP和svgo都开启转换效率会有所下降可以综合节省约85%的存储空间 // GZIP和svgo都开启转换效率会有所下降可以综合节省约85%的存储空间
} }

@ -2,12 +2,23 @@ package model
import ( import (
"fmt" "fmt"
"moredoc/util"
"moredoc/util/converter"
"os"
"path/filepath"
"strings"
"time" "time"
"go.uber.org/zap" "go.uber.org/zap"
"gorm.io/gorm" "gorm.io/gorm"
) )
const (
// Cover dimensions follow the aspect ratio of an A4 sheet (210 x 297).
// Both values are passed to util.CropImage when the cover image is generated.
DocumentCoverWidth = 210
DocumentCoverHeight = 297
)
const ( const (
DocumentStatusPending = iota // 待转换 DocumentStatusPending = iota // 待转换
DocumentStatusConverting // 转换中 DocumentStatusConverting // 转换中
@ -437,16 +448,19 @@ func (m *DBModel) CreateDocuments(documents []Document, categoryIds []int64) (do
return return
} }
// 根据文档hash查询已转换了的文档状态 // GetDocumentStatusConvertedByHash 根据文档hash查询已转换了的文档状态
func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map[string]int) { func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (hashMapDocuments map[string]Document) {
var ( var (
tableDocument = Document{}.TableName() tableDocument = Document{}.TableName()
tableAttachment = Attachment{}.TableName() tableAttachment = Attachment{}.TableName()
attachMapIndex = make(map[int64]int)
documentIds []int64
docs []Document
) )
statusMap = make(map[string]int) hashMapDocuments = make(map[string]Document)
sql := fmt.Sprintf( sql := fmt.Sprintf(
"select a.hash from %s a left join %s d on a.type_id = d.id where a.hash in ? and d.status = ? group by a.hash", "select a.hash,a.type_id from %s a left join %s d on a.type_id = d.id where a.hash in ? and d.status = ? group by a.hash",
tableAttachment, tableDocument, tableAttachment, tableDocument,
) )
@ -457,13 +471,19 @@ func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map
return return
} }
for _, attachment := range attachemnts { for idx, attachment := range attachemnts {
statusMap[attachment.Hash] = DocumentStatusConverted attachMapIndex[attachment.TypeId] = idx
documentIds = append(documentIds, attachment.TypeId)
}
m.db.Where("id in ?", documentIds).Find(&docs)
for _, doc := range docs {
hashMapDocuments[attachemnts[attachMapIndex[doc.Id]].Hash] = doc
} }
return return
} }
// ConvertDocument 文档转换 // ConvertDocument 文档转换。如果err返回gorm.ErrRecordNotFound表示已没有文档需要转换
// 1. 查询待转换的文档 // 1. 查询待转换的文档
// 2. 文档对应的md5 hash中是否有已转换的文档如果有则直接关联和调整状态为已转换 // 2. 文档对应的md5 hash中是否有已转换的文档如果有则直接关联和调整状态为已转换
// 3. 文档转PDF // 3. 文档转PDF
@ -471,6 +491,125 @@ func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map
// 5. 根据允许最大的预览页面将PDF转为svg同时转gzip压缩如果有需要的话 // 5. 根据允许最大的预览页面将PDF转为svg同时转gzip压缩如果有需要的话
// 6. 提取PDF文本以及获取文档信息 // 6. 提取PDF文本以及获取文档信息
// 7. 更新文档状态 // 7. 更新文档状态
func (m *DBModel) ConvertDocument() { func (m *DBModel) ConvertDocument() (err error) {
var document Document
err = m.db.Where("status = ?", DocumentStatusPending).First(&document).Error
if err != nil {
if err != gorm.ErrRecordNotFound {
m.logger.Error("ConvertDocument", zap.Error(err))
}
return
}
m.SetDocumentStatus(document.Id, DocumentStatusConverting)
attachment := m.GetAttachmentByTypeAndTypeId(AttachmentTypeDocument, document.Id)
if attachment.Id == 0 { // 附件不存在
m.SetDocumentStatus(document.Id, DocumentStatusFailed)
if err != nil {
m.logger.Error("ConvertDocument", zap.Error(err))
}
return
}
// 文档hash
hashMapDocs := m.GetDocumentStatusConvertedByHash([]string{attachment.Hash})
if len(hashMapDocs) > 0 {
// 已有文档转换成功将hash相同的文档相关数据迁移到当前文档
sql := " UPDATE `%s` SET `description`= ?, `cover` = ?, `width` = ?, `height`= ?, `preview`= ?, `pages` = ?, `status` = ? WHERE status in ? and id in (select type_id from `%s` where `hash` = ? and `type` = ?)"
sql = fmt.Sprintf(sql, Document{}.TableName(), Attachment{}.TableName())
for hash, doc := range hashMapDocs {
err = m.db.Exec(sql,
doc.Description, doc.Cover, doc.Width, doc.Height, doc.Preview, doc.Pages, DocumentStatusConverted, []int{DocumentStatusPending, DocumentStatusConverting, DocumentStatusFailed}, hash, AttachmentTypeDocument,
).Error
if err != nil {
m.logger.Error("ConvertDocument", zap.Error(err))
return
}
}
return
}
// 文档转为PDF
cfg := m.GetConfigOfConverter()
timeout := 30 * time.Minute
if cfg.Timeout > 0 {
timeout = time.Duration(cfg.Timeout) * time.Minute
}
localFile := strings.TrimLeft(attachment.Path, "./")
cvt := converter.NewConverter(m.logger, timeout)
dstPDF, err := cvt.ConvertToPDF(localFile)
if err != nil {
m.SetDocumentStatus(document.Id, DocumentStatusFailed)
m.logger.Error("ConvertDocument", zap.Error(err))
return
}
defer os.Remove(dstPDF)
document.Pages, _ = cvt.CountPDFPages(dstPDF)
document.Preview = cfg.MaxPreview
// PDF截取第一章图片作为封面(封面不是最重要的,期间出现错误,不影响文档转换)
pages, err := cvt.ConvertPDFToPNG(dstPDF, 1, 1)
if err != nil {
m.logger.Error("get pdf cover", zap.Error(err))
}
var baseDir = strings.TrimSuffix(localFile, filepath.Ext(localFile))
if len(pages) > 0 {
coverBig := baseDir + "/cover.big.png"
cover := baseDir + "/cover.png"
util.CopyFile(pages[0].PagePath, coverBig)
util.CopyFile(pages[0].PagePath, cover)
util.CropImage(cover, DocumentCoverWidth, DocumentCoverHeight)
document.Width, document.Height, _ = util.GetImageSize(coverBig) // 页面宽高
document.Cover = "/" + cover
}
// PDF转为SVG
toPage := 100000
if cfg.MaxPreview > 0 {
toPage = cfg.MaxPreview
}
pages, err = cvt.ConvertPDFToSVG(dstPDF, 1, toPage, cfg.EnableSVGO, cfg.EnableGZIP)
if err != nil {
m.SetDocumentStatus(document.Id, DocumentStatusFailed)
m.logger.Error("ConvertDocument", zap.Error(err))
return
}
for _, page := range pages {
util.CopyFile(page.PagePath, fmt.Sprintf(baseDir+"/%d%s", page.PageNum, filepath.Ext(page.PagePath)))
os.Remove(page.PagePath)
}
// 提取PDF文本以及获取文档信息
textFile, _ := cvt.ConvertPDFToTxt(dstPDF)
util.CopyFile(textFile, baseDir+"/content.txt")
// 读取文本内容,以提取关键字和摘要
if content, errRead := os.ReadFile(textFile); errRead == nil {
contentStr := string(content)
m.logger.Debug(textFile, zap.String("content", contentStr))
replacer := strings.NewReplacer(" ", "", "\r", " ", "\n", " ", "\t", " ")
document.Description = replacer.Replace(util.Substr(contentStr, 500))
}
os.Remove(textFile)
document.Status = DocumentStatusConverted
err = m.db.Select("description", "cover", "width", "height", "preview", "pages", "status").Where("id = ?", document.Id).Updates(document).Error
if err != nil {
m.SetDocumentStatus(document.Id, DocumentStatusFailed)
m.logger.Error("ConvertDocument", zap.Error(err))
}
return
}
func (m *DBModel) SetDocumentStatus(documentId int64, status int) (err error) {
err = m.db.Model(&Document{}).Where("id = ?", documentId).Update("status", status).Error
if err != nil {
m.logger.Error("SetDocumentStatus", zap.Error(err))
}
return
} }

@ -2,11 +2,13 @@ package model
import ( import (
"bytes" "bytes"
"moredoc/conf"
"os" "os"
"strings" "strings"
"testing" "testing"
"text/template" "text/template"
"go.uber.org/zap"
"gorm.io/driver/mysql" "gorm.io/driver/mysql"
"gorm.io/gorm" "gorm.io/gorm"
) )
@ -75,3 +77,22 @@ func TestGenData(t *testing.T) {
t.Log("生成 data.go 文件成功") t.Log("生成 data.go 文件成功")
} }
// TestConvertDocument runs one ConvertDocument pass against the MySQL database
// addressed by the hard-coded DSN below and fails on any returned error.
func TestConvertDocument(t *testing.T) {
	dsn := "root:root@tcp(127.0.0.1)/moredoc?charset=utf8mb4&parseTime=True&loc=Local"
	logger, _ := zap.NewDevelopment()
	dbCfg := &conf.Database{
		DSN:     dsn,
		Prefix:  "mnt_",
		ShowSQL: true,
	}

	dbModel, err := NewDBModel(dbCfg, logger)
	if err != nil {
		t.Fatal(err.Error())
	}

	if err = dbModel.ConvertDocument(); err != nil {
		t.Fatal(err.Error())
	}

	t.Log("success")
}

@ -36,19 +36,25 @@ type Page struct {
PagePath string PagePath string
} }
func NewConverter(logger *zap.Logger, cachePath string, timeout ...time.Duration) *Converter { func NewConverter(logger *zap.Logger, timeout ...time.Duration) *Converter {
expire := 1 * time.Hour expire := 1 * time.Hour
if len(timeout) > 0 { if len(timeout) > 0 {
expire = timeout[0] expire = timeout[0]
} }
os.MkdirAll(cachePath, os.ModePerm) defaultCachePath := "cache/convert"
os.MkdirAll(defaultCachePath, os.ModePerm)
return &Converter{ return &Converter{
cachePath: cachePath, cachePath: defaultCachePath,
timeout: expire, timeout: expire,
logger: logger.Named("converter"), logger: logger.Named("converter"),
} }
} }
// SetCachePath makes cachePath the directory used for conversion output and
// creates it (including parents) if needed. A failure to create the directory
// is logged; the path is still stored, as before.
func (c *Converter) SetCachePath(cachePath string) {
	if err := os.MkdirAll(cachePath, os.ModePerm); err != nil {
		c.logger.Error("set cache path", zap.String("path", cachePath), zap.Error(err))
	}
	c.cachePath = cachePath
}
// ConvertToPDF 将文件转为PDF。 // ConvertToPDF 将文件转为PDF。
// 自动根据文件类型调用相应的转换函数。 // 自动根据文件类型调用相应的转换函数。
func (c *Converter) ConvertToPDF(src string) (dst string, err error) { func (c *Converter) ConvertToPDF(src string) (dst string, err error) {
@ -64,6 +70,8 @@ func (c *Converter) ConvertToPDF(src string) (dst string, err error) {
return c.ConvertMOBIToPDF(src) return c.ConvertMOBIToPDF(src)
case ".chm": case ".chm":
return c.ConvertCHMToPDF(src) return c.ConvertCHMToPDF(src)
case ".pdf":
return c.PDFToPDF(src)
// case ".doc", ".docx", ".rtf", ".wps", ".odt", // case ".doc", ".docx", ".rtf", ".wps", ".odt",
// ".xls", ".xlsx", ".et", ".ods", // ".xls", ".xlsx", ".et", ".ods",
// ".ppt", ".pptx", ".dps", ".odp", ".pps", ".ppsx", ".pot", ".potx": // ".ppt", ".pptx", ".dps", ".odp", ".pps", ".ppsx", ".pot", ".potx":
@ -112,6 +120,7 @@ func (c *Converter) ConvertPDFToTxt(src string) (dst string, err error) {
c.logger.Debug("convert pdf to txt", zap.String("cmd", mutool), zap.Strings("args", args)) c.logger.Debug("convert pdf to txt", zap.String("cmd", mutool), zap.Strings("args", args))
_, err = util.ExecCommand(mutool, args, c.timeout) _, err = util.ExecCommand(mutool, args, c.timeout)
if err != nil { if err != nil {
c.logger.Error("convert pdf to txt", zap.String("cmd", mutool), zap.Strings("args", args), zap.Error(err))
return return
} }
return dst, nil return dst, nil
@ -138,10 +147,11 @@ func (c *Converter) ConvertPDFToSVG(src string, fromPage, toPage int, enableSVGO
} }
if enableGZIP { // gzip 压缩 if enableGZIP { // gzip 压缩
for _, page := range pages { for idx, page := range pages {
if dst, errCompress := c.CompressSVGByGZIP(page.PagePath); errCompress == nil { if dst, errCompress := c.CompressSVGByGZIP(page.PagePath); errCompress == nil {
os.Remove(page.PagePath) os.Remove(page.PagePath)
page.PagePath = dst page.PagePath = dst
pages[idx] = page
} }
} }
} }
@ -153,10 +163,19 @@ func (c *Converter) ConvertPDFToPNG(src string, fromPage, toPage int) (pages []P
return c.convertPDFToPage(src, fromPage, toPage, ".png") return c.convertPDFToPage(src, fromPage, toPage, ".png")
} }
// PDFToPDF copies a source file that is already a PDF into the dated cache
// directory and returns the slash-separated path of the copy. A copy failure
// is logged and returned together with the intended destination path.
func (c *Converter) PDFToPDF(src string) (dst string, err error) {
	dateDir := time.Now().Format(dirDteFmt)
	target := filepath.Join(c.cachePath, dateDir, filepath.Base(src))
	dst = strings.ReplaceAll(target, "\\", "/")

	if err = util.CopyFile(src, dst); err != nil {
		c.logger.Error("copy file error", zap.Error(err))
	}
	return dst, err
}
// ext 可选值: .png, .svg // ext 可选值: .png, .svg
func (c *Converter) convertPDFToPage(src string, fromPage, toPage int, ext string) (pages []Page, err error) { func (c *Converter) convertPDFToPage(src string, fromPage, toPage int, ext string) (pages []Page, err error) {
pageRange := fmt.Sprintf("%d-%d", fromPage, toPage) pageRange := fmt.Sprintf("%d-%d", fromPage, toPage)
cacheFile := strings.ReplaceAll(filepath.Join(c.cachePath, time.Now().Format(dirDteFmt), filepath.Base(src)+"/%d"+ext), "\\", "/") cacheFile := strings.ReplaceAll(filepath.Join(c.cachePath, time.Now().Format(dirDteFmt), strings.TrimSuffix(filepath.Base(src), filepath.Ext(src))+"/%d"+ext), "\\", "/")
args := []string{ args := []string{
"convert", "convert",
"-o", "-o",
@ -177,7 +196,7 @@ func (c *Converter) convertPDFToPage(src string, fromPage, toPage int, ext strin
break break
} }
pages = append(pages, Page{ pages = append(pages, Page{
PageNum: fromPage + i + 1, PageNum: fromPage + i,
PagePath: pagePath, PagePath: pagePath,
}) })
} }

@ -2,6 +2,7 @@ package converter
import ( import (
"os/exec" "os/exec"
"strings"
"testing" "testing"
"time" "time"
@ -19,7 +20,8 @@ var (
func init() { func init() {
logger, _ := zap.NewDevelopment() logger, _ := zap.NewDevelopment()
converter = NewConverter(logger, "../../cache/convert") converter = NewConverter(logger)
converter.SetCachePath("../../cache/convert")
} }
func TestConvertToPDF(t *testing.T) { func TestConvertToPDF(t *testing.T) {
@ -238,6 +240,8 @@ func TestCountPDFPages(t *testing.T) {
} }
func TestExistCommand(t *testing.T) { func TestExistCommand(t *testing.T) {
s := "我是中国人"
t.Log(strings.Count(s, "") - 1)
t.Logf("calibre= %v", converter.ExistCalibre()) t.Logf("calibre= %v", converter.ExistCalibre())
t.Logf("svgo= %v", converter.ExistSVGO()) t.Logf("svgo= %v", converter.ExistSVGO())
t.Logf("mupdf= %v", converter.ExistMupdf()) t.Logf("mupdf= %v", converter.ExistMupdf())

@ -0,0 +1,56 @@
package util
import (
"unicode"
"github.com/yanyiwu/gojieba"
)
// jieba is the package-level gojieba instance. NewJieba creates it on first
// use and hands the same instance to every Jieba wrapper it returns.
var jieba *gojieba.Jieba
// Jieba wraps a gojieba segmenter and exposes the word helpers of this package.
type Jieba struct {
jieba *gojieba.Jieba // underlying segmenter
}
// NewJieba returns a Jieba wrapper around the package-level gojieba instance.
//
// The dictionaries are loaded from dictDir[0] when given, otherwise from
// "dict". The underlying instance is created on the first call only, so a
// dictDir passed to a later call has no effect. The nil check is not guarded
// by any lock.
func NewJieba(dictDir ...string) *Jieba {
	dir := "dict"
	if len(dictDir) > 0 {
		dir = dictDir[0]
	}

	if jieba == nil {
		jieba = gojieba.NewJieba(
			dir+"/jieba.dict.utf8",
			dir+"/hmm_model.utf8",
			dir+"/user.dict.utf8",
			dir+"/idf.utf8",
			dir+"/stop_words.utf8",
		)
	}

	return &Jieba{jieba: jieba}
}
// AddWord registers each of the given words with the underlying segmenter,
// in the order they are passed.
func (j *Jieba) AddWord(words ...string) {
	for i := range words {
		j.jieba.AddWord(words[i])
	}
}
// SegWords extracts up to length[0] keywords from text (10 when length is
// omitted) and drops every keyword that starts with whitespace, punctuation
// or a digit.
//
// The check is done on the first rune rather than the first byte, so
// multi-byte characters such as full-width punctuation and digits are
// classified correctly. Empty keywords are skipped.
func (j *Jieba) SegWords(text string, length ...int) (words []string) {
	topk := 10
	if len(length) > 0 {
		topk = length[0]
	}

	for _, wd := range j.jieba.Extract(text, topk) {
		if wd == "" {
			continue
		}
		// Keywords are short, so converting to runes to read the first one is cheap.
		first := []rune(wd)[0]
		if unicode.IsSpace(first) || unicode.IsPunct(first) || unicode.IsDigit(first) {
			continue
		}
		words = append(words, wd)
	}
	return words
}

@ -68,6 +68,18 @@ func CropImage(file string, width, height int) (err error) {
return imaging.Save(img, file) return imaging.Save(img, file)
} }
// GetImageSize opens the image stored in file and returns its width and
// height in pixels. On failure the error from opening the image is returned
// and both dimensions are 0.
//
// The size is taken from Dx/Dy of the image bounds, which stays correct for
// images whose bounds do not start at the origin.
func GetImageSize(file string) (width, height int, err error) {
	var img image.Image
	img, err = imaging.Open(file)
	if err != nil {
		return 0, 0, err
	}
	bounds := img.Bounds()
	return bounds.Dx(), bounds.Dy(), nil
}
// LimitMin 数字最小值限制 // LimitMin 数字最小值限制
func LimitMin(number int, minValue int) int { func LimitMin(number int, minValue int) int {
if number >= minValue { if number >= minValue {
@ -113,6 +125,12 @@ func CopyFile(src, dst string) error {
if err != nil { if err != nil {
return fmt.Errorf("couldn't open source file: %s", err) return fmt.Errorf("couldn't open source file: %s", err)
} }
dir := filepath.Dir(dst)
if _, e := os.Stat(dir); os.IsNotExist(e) {
os.MkdirAll(dir, os.ModePerm)
}
outputFile, err := os.Create(dst) outputFile, err := os.Create(dst)
if err != nil { if err != nil {
inputFile.Close() inputFile.Close()
@ -126,3 +144,24 @@ func CopyFile(src, dst string) error {
} }
return nil return nil
} }
// Substr returns at most length runes of str, starting at rune offset
// start[0] (0 when start is omitted). Offsets and lengths count runes, not
// bytes, so multi-byte characters are never split.
//
// Out-of-range arguments are clamped instead of panicking: a negative start
// is treated as 0, a start beyond the end yields "", a negative length
// yields "", and a length running past the end is cut at the end of str.
func Substr(str string, length int, start ...int) string {
	runes := []rune(str)
	total := len(runes)

	from := 0
	if len(start) > 0 {
		from = start[0]
	}
	if from < 0 {
		from = 0
	}
	if from > total {
		from = total
	}

	if length < 0 {
		length = 0
	}
	// Compare against the remaining rune count so from+length cannot overflow.
	if remaining := total - from; length > remaining {
		length = remaining
	}

	return string(runes[from : from+length])
}

Loading…
Cancel
Save