初步完成文档转换功能

dev
truthhun 1 year ago
parent f1c696666b
commit eeeb9885be

@ -2,6 +2,7 @@ package biz
import (
"context"
"strings"
pb "moredoc/api/v1"
"moredoc/middleware/auth"
@ -72,6 +73,7 @@ func (s *DocumentAPIService) CreateDocument(ctx context.Context, req *pb.CreateD
var (
documents []model.Document
uuidAttachmentIdMap = make(map[string]int64)
jieba = util.NewJieba()
)
for _, doc := range req.Document {
attachment, ok := attachmentMap[doc.AttachmentId]
@ -80,14 +82,15 @@ func (s *DocumentAPIService) CreateDocument(ctx context.Context, req *pb.CreateD
}
doc := model.Document{
Title: doc.Title,
UserId: userCliams.UserId,
UUID: uuid.Must(uuid.NewV4()).String(),
Score: 300,
Price: int(doc.Price),
Size: attachment.Size,
Ext: attachment.Ext,
Status: model.DocumentStatusPending,
Title: doc.Title,
Keywords: strings.Join(jieba.SegWords(doc.Title, 10), ","),
UserId: userCliams.UserId,
UUID: uuid.Must(uuid.NewV4()).String(),
Score: 300,
Price: int(doc.Price),
Size: attachment.Size,
Ext: attachment.Ext,
Status: model.DocumentStatusPending,
}
uuidAttachmentIdMap[doc.UUID] = attachment.Id
documents = append(documents, doc)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,4 @@
云计算
韩玉鉴赏
蓝翔 nz
区块链 10 nz

@ -29,6 +29,7 @@ require (
github.com/goccy/go-json v0.9.7 // indirect
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
github.com/pelletier/go-toml/v2 v2.0.1 // indirect
github.com/yanyiwu/gojieba v1.2.0 // indirect
golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 // indirect
)

@ -176,6 +176,14 @@ func (m *DBModel) DeleteAttachment(ids []int64) (err error) {
return
}
// GetAttachmentByTypeAndTypeId loads the first attachment whose type and
// type_id columns match the given values.
//
// A missing record is not treated as an error: the zero-value Attachment is
// returned silently. Any other database error is logged and the zero value
// (or whatever was scanned so far) is returned.
func (m *DBModel) GetAttachmentByTypeAndTypeId(typ int, typeId int64) (attachment Attachment) {
	query := m.db.Where("type = ? and type_id = ?", typ, typeId)
	if err := query.First(&attachment).Error; err != nil && err != gorm.ErrRecordNotFound {
		m.logger.Error("GetAttachmentByTypeAndTypeId", zap.Error(err))
	}
	return attachment
}
func (m *DBModel) setAttachmentType(attachmentType int, attachmentTypeId int64, paths []string) {
var hashes []string
for _, path := range paths {

@ -232,10 +232,10 @@ const (
// ConfigConverter holds the document conversion settings.
type ConfigConverter struct {
MaxPreview int32 `json:"max_preview"` // Maximum number of preview pages allowed per document; 0 means no limit (every page is converted).
Timeout int32 `json:"timeout"` // Conversion timeout in minutes; defaults to 30 minutes.
EnableSVGO bool `json:"enable_svgo"` // Whether to compress SVG files with SVGO. Conversion gets somewhat slower; saves about 1/2 of the storage compared with the raw SVG files.
EnableGZIP bool `json:"enable_gzip"` // Whether to compress SVG files with GZIP. Conversion gets somewhat slower; saves about 3/4 of the storage compared with the raw SVG files.
MaxPreview int `json:"max_preview"` // Maximum number of preview pages allowed per document; 0 means no limit (every page is converted).
Timeout int `json:"timeout"` // Conversion timeout in minutes; defaults to 30 minutes.
EnableSVGO bool `json:"enable_svgo"` // Whether to compress SVG files with SVGO. Conversion gets somewhat slower; saves about 1/2 of the storage compared with the raw SVG files.
EnableGZIP bool `json:"enable_gzip"` // Whether to compress SVG files with GZIP. Conversion gets somewhat slower; saves about 3/4 of the storage compared with the raw SVG files.
// With both GZIP and SVGO enabled conversion gets somewhat slower, but about 85% of the storage is saved overall.
}

@ -2,12 +2,23 @@ package model
import (
"fmt"
"moredoc/util"
"moredoc/util/converter"
"os"
"path/filepath"
"strings"
"time"
"go.uber.org/zap"
"gorm.io/gorm"
)
const (
// The cover uses the proportions of an A4 sheet of paper (210 x 297).
DocumentCoverWidth = 210
DocumentCoverHeight = 297
)
const (
DocumentStatusPending = iota // 待转换
DocumentStatusConverting // 转换中
@ -437,16 +448,19 @@ func (m *DBModel) CreateDocuments(documents []Document, categoryIds []int64) (do
return
}
// 根据文档hash查询已转换了的文档状态
func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map[string]int) {
// GetDocumentStatusConvertedByHash 根据文档hash查询已转换了的文档状态
func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (hashMapDocuments map[string]Document) {
var (
tableDocument = Document{}.TableName()
tableAttachment = Attachment{}.TableName()
attachMapIndex = make(map[int64]int)
documentIds []int64
docs []Document
)
statusMap = make(map[string]int)
hashMapDocuments = make(map[string]Document)
sql := fmt.Sprintf(
"select a.hash from %s a left join %s d on a.type_id = d.id where a.hash in ? and d.status = ? group by a.hash",
"select a.hash,a.type_id from %s a left join %s d on a.type_id = d.id where a.hash in ? and d.status = ? group by a.hash",
tableAttachment, tableDocument,
)
@ -457,13 +471,19 @@ func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map
return
}
for _, attachment := range attachemnts {
statusMap[attachment.Hash] = DocumentStatusConverted
for idx, attachment := range attachemnts {
attachMapIndex[attachment.TypeId] = idx
documentIds = append(documentIds, attachment.TypeId)
}
m.db.Where("id in ?", documentIds).Find(&docs)
for _, doc := range docs {
hashMapDocuments[attachemnts[attachMapIndex[doc.Id]].Hash] = doc
}
return
}
// ConvertDocument 文档转换
// ConvertDocument 文档转换。如果err返回gorm.ErrRecordNotFound表示已没有文档需要转换
// 1. 查询待转换的文档
// 2. 文档对应的md5 hash中是否有已转换的文档如果有则直接关联和调整状态为已转换
// 3. 文档转PDF
@ -471,6 +491,125 @@ func (m *DBModel) GetDocumentStatusConvertedByHash(hash []string) (statusMap map
// 5. 根据允许最大的预览页面将PDF转为svg同时转gzip压缩如果有需要的话
// 6. 提取PDF文本以及获取文档信息
// 7. 更新文档状态
func (m *DBModel) ConvertDocument() {
// ConvertDocument converts the first document whose status is pending.
//
// It marks the document as converting, looks up its attachment and, when a
// document with the same attachment hash has already been converted, reuses
// that result. Otherwise it converts the file to PDF, renders a cover image
// and the preview pages, extracts the text for the description and stores the
// result with status converted.
//
// It returns gorm.ErrRecordNotFound when no pending document exists. A
// document without an attachment is marked as failed and a nil error is
// returned. Conversion and database failures are logged and returned.
func (m *DBModel) ConvertDocument() (err error) {
	var document Document
	err = m.db.Where("status = ?", DocumentStatusPending).First(&document).Error
	if err != nil {
		// "No record" only means there is nothing to do, so it is not logged.
		if err != gorm.ErrRecordNotFound {
			m.logger.Error("ConvertDocument", zap.Error(err))
		}
		return
	}

	m.SetDocumentStatus(document.Id, DocumentStatusConverting)

	attachment := m.GetAttachmentByTypeAndTypeId(AttachmentTypeDocument, document.Id)
	if attachment.Id == 0 { // the attachment does not exist
		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
		// err is always nil at this point, so log the failure explicitly
		// instead of guarding the log call with an err check.
		m.logger.Error("ConvertDocument: attachment not found", zap.Int64("documentId", document.Id))
		return
	}

	// Reuse the result of an already converted document with the same hash.
	hashMapDocs := m.GetDocumentStatusConvertedByHash([]string{attachment.Hash})
	if len(hashMapDocs) > 0 {
		// Copy the conversion data to every unconverted document that shares the hash.
		sql := " UPDATE `%s` SET `description`= ?, `cover` = ?, `width` = ?, `height`= ?, `preview`= ?, `pages` = ?, `status` = ? WHERE status in ? and id in (select type_id from `%s` where `hash` = ? and `type` = ?)"
		sql = fmt.Sprintf(sql, Document{}.TableName(), Attachment{}.TableName())
		for hash, doc := range hashMapDocs {
			err = m.db.Exec(sql,
				doc.Description, doc.Cover, doc.Width, doc.Height, doc.Preview, doc.Pages, DocumentStatusConverted, []int{DocumentStatusPending, DocumentStatusConverting, DocumentStatusFailed}, hash, AttachmentTypeDocument,
			).Error
			if err != nil {
				m.logger.Error("ConvertDocument", zap.Error(err))
				return
			}
		}
		return
	}

	// Convert the document to PDF.
	cfg := m.GetConfigOfConverter()
	timeout := 30 * time.Minute
	if cfg.Timeout > 0 {
		timeout = time.Duration(cfg.Timeout) * time.Minute
	}

	// TrimLeft removes every leading '.' and '/' character, turning the
	// stored path into one relative to the working directory.
	localFile := strings.TrimLeft(attachment.Path, "./")
	cvt := converter.NewConverter(m.logger, timeout)
	dstPDF, err := cvt.ConvertToPDF(localFile)
	if err != nil {
		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
		m.logger.Error("ConvertDocument", zap.Error(err))
		return
	}
	defer os.Remove(dstPDF)

	// A failed page count leaves Pages at the value returned alongside the error.
	document.Pages, _ = cvt.CountPDFPages(dstPDF)
	document.Preview = cfg.MaxPreview

	// Use the first PDF page as the cover. The cover is not essential, so
	// errors here are logged or ignored and never abort the conversion.
	pages, err := cvt.ConvertPDFToPNG(dstPDF, 1, 1)
	if err != nil {
		m.logger.Error("get pdf cover", zap.Error(err))
	}

	baseDir := strings.TrimSuffix(localFile, filepath.Ext(localFile))
	if len(pages) > 0 {
		coverBig := baseDir + "/cover.big.png"
		cover := baseDir + "/cover.png"
		// Best effort: copy/crop failures only leave the cover missing.
		util.CopyFile(pages[0].PagePath, coverBig)
		util.CopyFile(pages[0].PagePath, cover)
		util.CropImage(cover, DocumentCoverWidth, DocumentCoverHeight)
		document.Width, document.Height, _ = util.GetImageSize(coverBig) // page width and height
		document.Cover = "/" + cover
	}

	// Convert the PDF to SVG pages.
	toPage := 100000
	if cfg.MaxPreview > 0 {
		toPage = cfg.MaxPreview
	}

	pages, err = cvt.ConvertPDFToSVG(dstPDF, 1, toPage, cfg.EnableSVGO, cfg.EnableGZIP)
	if err != nil {
		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
		m.logger.Error("ConvertDocument", zap.Error(err))
		return
	}

	for _, page := range pages {
		// baseDir is passed as an argument rather than concatenated into the
		// format string, so a '%' in the path cannot corrupt the file name.
		dst := fmt.Sprintf("%s/%d%s", baseDir, page.PageNum, filepath.Ext(page.PagePath))
		if errCopy := util.CopyFile(page.PagePath, dst); errCopy != nil {
			m.logger.Error("ConvertDocument: copy page", zap.String("dst", dst), zap.Error(errCopy))
		}
		os.Remove(page.PagePath)
	}

	// Extract the PDF text. A failed extraction only leaves the description
	// empty, so the error is not treated as a conversion failure.
	textFile, _ := cvt.ConvertPDFToTxt(dstPDF)
	if errCopy := util.CopyFile(textFile, baseDir+"/content.txt"); errCopy != nil {
		m.logger.Error("ConvertDocument: copy content", zap.Error(errCopy))
	}

	// Read the text to build the description from its first 500 characters.
	if content, errRead := os.ReadFile(textFile); errRead == nil {
		contentStr := string(content)
		m.logger.Debug(textFile, zap.String("content", contentStr))
		replacer := strings.NewReplacer(" ", "", "\r", " ", "\n", " ", "\t", " ")
		document.Description = replacer.Replace(util.Substr(contentStr, 500))
	}
	os.Remove(textFile)

	document.Status = DocumentStatusConverted
	err = m.db.Select("description", "cover", "width", "height", "preview", "pages", "status").Where("id = ?", document.Id).Updates(document).Error
	if err != nil {
		m.SetDocumentStatus(document.Id, DocumentStatusFailed)
		m.logger.Error("ConvertDocument", zap.Error(err))
	}
	return
}
// SetDocumentStatus writes status into the status column of the document
// with the given id. A database error is logged and returned to the caller.
func (m *DBModel) SetDocumentStatus(documentId int64, status int) (err error) {
	target := m.db.Model(&Document{}).Where("id = ?", documentId)
	if err = target.Update("status", status).Error; err != nil {
		m.logger.Error("SetDocumentStatus", zap.Error(err))
	}
	return err
}

@ -2,11 +2,13 @@ package model
import (
"bytes"
"moredoc/conf"
"os"
"strings"
"testing"
"text/template"
"go.uber.org/zap"
"gorm.io/driver/mysql"
"gorm.io/gorm"
)
@ -75,3 +77,22 @@ func TestGenData(t *testing.T) {
t.Log("生成 data.go 文件成功")
}
// TestConvertDocument runs one conversion against the local development
// database. It needs a reachable MySQL instance matching the DSN below.
func TestConvertDocument(t *testing.T) {
	dsn := "root:root@tcp(127.0.0.1)/moredoc?charset=utf8mb4&parseTime=True&loc=Local"
	logger, _ := zap.NewDevelopment()
	dbModel, err := NewDBModel(&conf.Database{
		DSN:     dsn,
		Prefix:  "mnt_",
		ShowSQL: true,
	}, logger)
	if err != nil {
		t.Fatal(err.Error())
	}

	err = dbModel.ConvertDocument()
	// gorm.ErrRecordNotFound is ConvertDocument's signal that no document is
	// waiting for conversion; that is an empty queue, not a test failure.
	if err == gorm.ErrRecordNotFound {
		t.Skip("no pending document to convert")
	}
	if err != nil {
		t.Fatal(err.Error())
	}
	t.Log("success")
}

@ -36,19 +36,25 @@ type Page struct {
PagePath string
}
// NewConverter builds a Converter whose logger is the given logger named
// "converter". The first value of timeout, when supplied, replaces the
// one-hour default. The cache directory is created up front; the error
// returned by os.MkdirAll is not checked.
func NewConverter(logger *zap.Logger, cachePath string, timeout ...time.Duration) *Converter {
func NewConverter(logger *zap.Logger, timeout ...time.Duration) *Converter {
// Default timeout, used when the caller passes none.
expire := 1 * time.Hour
if len(timeout) > 0 {
expire = timeout[0]
}
os.MkdirAll(cachePath, os.ModePerm)
// Path is relative, so it resolves against the process working directory.
defaultCachePath := "cache/convert"
os.MkdirAll(defaultCachePath, os.ModePerm)
return &Converter{
cachePath: cachePath,
cachePath: defaultCachePath,
timeout: expire,
logger: logger.Named("converter"),
}
}
// SetCachePath switches the converter to the given cache directory, creating
// it (and any missing parents) first.
//
// The method has no error result, so a failure to create the directory is
// logged instead of being dropped; the path is still stored and later
// conversions writing into it will report their own errors.
func (c *Converter) SetCachePath(cachePath string) {
	if err := os.MkdirAll(cachePath, os.ModePerm); err != nil {
		c.logger.Error("create cache path", zap.String("path", cachePath), zap.Error(err))
	}
	c.cachePath = cachePath
}
// ConvertToPDF 将文件转为PDF。
// 自动根据文件类型调用相应的转换函数。
func (c *Converter) ConvertToPDF(src string) (dst string, err error) {
@ -64,6 +70,8 @@ func (c *Converter) ConvertToPDF(src string) (dst string, err error) {
return c.ConvertMOBIToPDF(src)
case ".chm":
return c.ConvertCHMToPDF(src)
case ".pdf":
return c.PDFToPDF(src)
// case ".doc", ".docx", ".rtf", ".wps", ".odt",
// ".xls", ".xlsx", ".et", ".ods",
// ".ppt", ".pptx", ".dps", ".odp", ".pps", ".ppsx", ".pot", ".potx":
@ -112,6 +120,7 @@ func (c *Converter) ConvertPDFToTxt(src string) (dst string, err error) {
c.logger.Debug("convert pdf to txt", zap.String("cmd", mutool), zap.Strings("args", args))
_, err = util.ExecCommand(mutool, args, c.timeout)
if err != nil {
c.logger.Error("convert pdf to txt", zap.String("cmd", mutool), zap.Strings("args", args), zap.Error(err))
return
}
return dst, nil
@ -138,10 +147,11 @@ func (c *Converter) ConvertPDFToSVG(src string, fromPage, toPage int, enableSVGO
}
if enableGZIP { // gzip 压缩
for _, page := range pages {
for idx, page := range pages {
if dst, errCompress := c.CompressSVGByGZIP(page.PagePath); errCompress == nil {
os.Remove(page.PagePath)
page.PagePath = dst
pages[idx] = page
}
}
}
@ -153,10 +163,19 @@ func (c *Converter) ConvertPDFToPNG(src string, fromPage, toPage int) (pages []P
return c.convertPDFToPage(src, fromPage, toPage, ".png")
}
// PDFToPDF copies a source file that is already a PDF into the converter's
// cache directory, under a sub-directory named after the current time
// formatted with dirDteFmt, keeping the original base name.
//
// The returned path always uses forward slashes. A copy failure is logged and
// returned together with the (possibly unusable) destination path.
func (c *Converter) PDFToPDF(src string) (dst string, err error) {
	cached := filepath.Join(c.cachePath, time.Now().Format(dirDteFmt), filepath.Base(src))
	dst = strings.ReplaceAll(cached, "\\", "/")
	if err = util.CopyFile(src, dst); err != nil {
		c.logger.Error("copy file error", zap.Error(err))
	}
	return dst, err
}
// ext 可选值: .png, .svg
func (c *Converter) convertPDFToPage(src string, fromPage, toPage int, ext string) (pages []Page, err error) {
pageRange := fmt.Sprintf("%d-%d", fromPage, toPage)
cacheFile := strings.ReplaceAll(filepath.Join(c.cachePath, time.Now().Format(dirDteFmt), filepath.Base(src)+"/%d"+ext), "\\", "/")
cacheFile := strings.ReplaceAll(filepath.Join(c.cachePath, time.Now().Format(dirDteFmt), strings.TrimSuffix(filepath.Base(src), filepath.Ext(src))+"/%d"+ext), "\\", "/")
args := []string{
"convert",
"-o",
@ -177,7 +196,7 @@ func (c *Converter) convertPDFToPage(src string, fromPage, toPage int, ext strin
break
}
pages = append(pages, Page{
PageNum: fromPage + i + 1,
PageNum: fromPage + i,
PagePath: pagePath,
})
}

@ -2,6 +2,7 @@ package converter
import (
"os/exec"
"strings"
"testing"
"time"
@ -19,7 +20,8 @@ var (
// init builds the package-level converter used by the tests in this file and
// points its cache at ../../cache/convert.
func init() {
// The error from zap.NewDevelopment is ignored here.
logger, _ := zap.NewDevelopment()
converter = NewConverter(logger, "../../cache/convert")
converter = NewConverter(logger)
converter.SetCachePath("../../cache/convert")
}
func TestConvertToPDF(t *testing.T) {
@ -238,6 +240,8 @@ func TestCountPDFPages(t *testing.T) {
}
func TestExistCommand(t *testing.T) {
s := "我是中国人"
t.Log(strings.Count(s, "") - 1)
t.Logf("calibre= %v", converter.ExistCalibre())
t.Logf("svgo= %v", converter.ExistSVGO())
t.Logf("mupdf= %v", converter.ExistMupdf())

@ -0,0 +1,56 @@
package util
import (
"unicode"
"github.com/yanyiwu/gojieba"
)
// jieba is the package-level gojieba instance. NewJieba creates it on first
// use and hands the same instance to every wrapper it returns.
var jieba *gojieba.Jieba
// Jieba wraps a *gojieba.Jieba and exposes the word helpers of this package.
type Jieba struct {
jieba *gojieba.Jieba
}
// NewJieba returns a wrapper around the package-level gojieba instance,
// creating that instance on the first call.
//
// The dictionaries are loaded from dictDir[0] when given, otherwise from the
// relative directory "dict". Because the instance is created only once, the
// directory passed on later calls has no effect.
//
// NOTE(review): the nil check and the assignment are not synchronized;
// confirm that the first call cannot race with another one.
func NewJieba(dictDir ...string) *Jieba {
	dir := "dict"
	if len(dictDir) > 0 {
		dir = dictDir[0]
	}

	if jieba == nil {
		jieba = gojieba.NewJieba(
			dir+"/jieba.dict.utf8",
			dir+"/hmm_model.utf8",
			dir+"/user.dict.utf8",
			dir+"/idf.utf8",
			dir+"/stop_words.utf8",
		)
	}

	return &Jieba{jieba: jieba}
}
// AddWord registers each of the given words with the underlying gojieba
// instance, in the order they were passed.
func (j *Jieba) AddWord(words ...string) {
	for i := range words {
		j.jieba.AddWord(words[i])
	}
}
// SegWords extracts keywords from text and drops every keyword that starts
// with whitespace, punctuation or a digit.
//
// length[0], when given, is the number of keywords requested from the
// extractor (default 10); the result can be shorter because of the filter.
// words is nil when nothing survives.
func (j *Jieba) SegWords(text string, length ...int) (words []string) {
	topk := 10
	if len(length) > 0 {
		topk = length[0]
	}

	for _, wd := range j.jieba.Extract(text, topk) {
		// Classify the first rune, not the first byte: the lead byte of a
		// multi-byte UTF-8 sequence is never recognised as punctuation or a
		// digit, so full-width characters would slip through the filter.
		runes := []rune(wd)
		if len(runes) == 0 {
			// Indexing an empty keyword would panic.
			continue
		}
		first := runes[0]
		if unicode.IsSpace(first) || unicode.IsPunct(first) || unicode.IsDigit(first) {
			continue
		}
		words = append(words, wd)
	}
	return words
}

@ -68,6 +68,18 @@ func CropImage(file string, width, height int) (err error) {
return imaging.Save(img, file)
}
// GetImageSize opens the image file and returns its width and height.
//
// On failure to open or decode the file, width and height are 0 and err
// carries the error from imaging.Open.
func GetImageSize(file string) (width, height int, err error) {
	var img image.Image
	img, err = imaging.Open(file)
	if err != nil {
		return
	}

	// Dx/Dy measure the extent of the bounds; Max.X/Max.Y equal the size only
	// when the bounds start at (0,0).
	bounds := img.Bounds()
	width = bounds.Dx()
	height = bounds.Dy()
	return
}
// LimitMin 数字最小值限制
func LimitMin(number int, minValue int) int {
if number >= minValue {
@ -113,6 +125,12 @@ func CopyFile(src, dst string) error {
if err != nil {
return fmt.Errorf("couldn't open source file: %s", err)
}
dir := filepath.Dir(dst)
if _, e := os.Stat(dir); os.IsNotExist(e) {
os.MkdirAll(dir, os.ModePerm)
}
outputFile, err := os.Create(dst)
if err != nil {
inputFile.Close()
@ -126,3 +144,24 @@ func CopyFile(src, dst string) error {
}
return nil
}
// Substr returns at most length characters (runes, not bytes) of str,
// beginning at the rune offset start[0] (0 when start is omitted).
//
// Out-of-range arguments are clamped instead of panicking: a negative start
// counts as 0, a start past the end yields "", a negative length yields "",
// and a length reaching past the end is cut at the end of the string.
func Substr(str string, length int, start ...int) string {
	begin := 0
	if len(start) > 0 {
		begin = start[0]
	}

	runes := []rune(str)
	total := len(runes)

	if begin < 0 {
		begin = 0
	}
	if begin > total {
		begin = total
	}

	if length < 0 {
		length = 0
	}
	// Compare against the remaining rune count rather than computing
	// begin+length first, which could overflow for a huge length.
	if remaining := total - begin; length > remaining {
		length = remaining
	}

	return string(runes[begin : begin+length])
}

Loading…
Cancel
Save