From 9906dd43c791d5f5085a9e6575bb515b0d9df416 Mon Sep 17 00:00:00 2001 From: Book Pauk Date: Mon, 14 Dec 2020 02:22:38 +0700 Subject: [PATCH] =?UTF-8?q?=D0=A0=D0=B0=D0=B1=D0=BE=D1=82=D0=B0=20=D0=BD?= =?UTF-8?q?=D0=B0=D0=B4=20=D0=BA=D0=BE=D0=BD=D0=B2=D0=B5=D1=80=D1=82=D0=B5?= =?UTF-8?q?=D1=80=D0=BE=D0=BC=20pdf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/Reader/BookConverter/ConvertPdf.js | 177 ++++++++++-------- 1 file changed, 95 insertions(+), 82 deletions(-) diff --git a/server/core/Reader/BookConverter/ConvertPdf.js b/server/core/Reader/BookConverter/ConvertPdf.js index 7645713b..a41a31e6 100644 --- a/server/core/Reader/BookConverter/ConvertPdf.js +++ b/server/core/Reader/BookConverter/ConvertPdf.js @@ -55,6 +55,10 @@ class ConvertPdf extends ConvertHtml { let images = []; let loading = []; + let inText = false; + let bold = false; + let italic = false; + let i = -1; const loadImage = async(image) => { @@ -81,22 +85,30 @@ class ConvertPdf extends ConvertHtml { } }; + const isTextBold = (text) => { + const m = text.trim().match(/^(.*)<\/b>$/); + return m && !m[1].match(/|<\/b>||<\/i>/g); + }; + + const isTextEmpty = (text) => { + return text.replace(/|<\/b>||<\/i>/g, '').trim() == ''; + }; + const putPageLines = () => { - pagelines.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left)) + pagelines.sort((a, b) => (Math.abs(a.top - b.top) > 3 ? a.top - b.top : 0)*10000 + (a.left - b.left)) //объединяем в одну строку равные по высоте const pl = []; let pt = 0; let j = -1; pagelines.forEach(line => { - //добавим закрывающий тег стиля - line.text += line.tClose; + if (isTextEmpty(line.text)) + return; //проверим, возможно это заголовок - if (line.fonts.length == 1 && line.pageWidth) { - const f = (line.fonts.length ? fonts[line.fonts[0]] : null); + if (line.fontId && line.pageWidth) { const centerLeft = (line.pageWidth - line.width)/2; - if (f && f.isBold && Math.abs(centerLeft - line.left) < 3) { + if (isTextBold(line.text) && Math.abs(centerLeft - line.left) < 10) { if (!sectionTitleFound) { line.isSectionTitle = true; sectionTitleFound = true; @@ -124,8 +136,8 @@ class ConvertPdf extends ConvertHtml { //добавим пустую строку, если надо const prevLine = (i > lastIndex ? lines[i] : {fonts: [], top: 0}); if (prevLine && !prevLine.isImage) { - const f = (prevLine.fonts.length ? fonts[prevLine.fonts[0]] : (line.fonts.length ? fonts[line.fonts[0]] : null)); - if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize*1.8) { + const f = (prevLine.fontId ? fonts[prevLine.fontId] : (line.fontId ? fonts[line.fontId] : null)); + if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize * 1.8) { i++; lines[i] = {text: '
'}; } @@ -138,29 +150,26 @@ class ConvertPdf extends ConvertHtml { putImage(100000); }; + const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars + if (!cutCounter && inText) { + let tOpen = (bold ? '' : ''); + tOpen += (italic ? '' : ''); + let tClose = (italic ? '' : ''); + tClose += (bold ? '' : ''); + + line.text += ` ${tOpen}${text}${tClose}`; + } + }; + const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars - if (tag == 'textstyle') { - const attrs = sax.getAttrsSync(tail); - const fontId = (attrs.id && attrs.id.value ? attrs.id.value : ''); - const fontStyle = (attrs.fontstyle && attrs.fontstyle.value ? attrs.fontstyle.value : ''); - const fontSize = (attrs.fontsize && attrs.fontsize.value ? attrs.fontsize.value : ''); - - if (fontId) { - const styleTags = {bold: 'b', italics: 'i', superscript: 'sup', subscript: 'sub'}; - const f = fonts[fontId] = {tOpen: '', tClose: '', isBold: false, fontSize}; - - if (fontStyle) { - const styles = fontStyle.split(' '); - styles.forEach(style => { - const s = styleTags[style]; - if (s) { - f.tOpen += `<${s}>`; - f.tClose = `${f.tClose}`; - if (s == 'b') - f.isBold = true; - } - }); - } + if (inText) { + switch (tag) { + case 'i': + italic = true; + break; + case 'b': + bold = true; + break; } } @@ -173,80 +182,78 @@ class ConvertPdf extends ConvertHtml { putPageLines(); } - if (tag == 'textline') { + if (tag == 'fontspec') { + const attrs = sax.getAttrsSync(tail); + const fontId = (attrs.id && attrs.id.value ? attrs.id.value : ''); + const fontSize = (attrs.size && attrs.size.value ? attrs.size.value : ''); + + if (fontId) { + fonts[fontId] = {fontSize}; + + } + } + + if (tag == 'text' && !inText) { const attrs = sax.getAttrsSync(tail); line = { text: '', - top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10), - left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10), + top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10), + left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10), width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10), height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10), - tOpen: '', - tClose: '', isSectionTitle: false, isSubtitle: false, pageWidth: page.width, - fonts: [], + fontId: (attrs.font && attrs.font.value ? attrs.font.value : ''), }; if (line.width != 0 || line.height != 0) { + inText = true; pagelines.push(line); } } - if (tag == 'string') { + if (tag == 'image') { const attrs = sax.getAttrsSync(tail); - if (attrs.content && attrs.content.value) { + let src = (attrs.src && attrs.src.value ? attrs.src.value : ''); + if (src) { + const image = { + isImage: true, + src, + data: '', + type: '', + top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10) || 0, + left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10) || 0, + width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0, + height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0, + }; - let tOpen = ''; - let tClose = ''; - const fontId = (attrs.stylerefs && attrs.stylerefs.value ? attrs.stylerefs.value : ''); - if (fontId && fonts[fontId]) { - tOpen = fonts[fontId].tOpen; - tClose = fonts[fontId].tClose; - if (!line.fonts.length || line.fonts[0] != fontId) - line.fonts.push(fontId); - } - - if (line.tOpen != tOpen) { - line.text += line.tClose + tOpen; - line.tOpen = tOpen; - line.tClose = tClose; - } - - line.text += `${line.text.length ? ' ' : ''}${attrs.content.value}`; - } - } - - if (tag == 'illustration') { - const attrs = sax.getAttrsSync(tail); - if (attrs.type && attrs.type.value == 'image') { - let src = (attrs.fileid && attrs.fileid.value ? attrs.fileid.value : ''); - if (src) { - const image = { - isImage: true, - src, - data: '', - type: '', - top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10) || 0, - left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10) || 0, - width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0, - height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0, - }; - const exists = images.filter(img => (img.top == image.top && img.left == image.left && img.width == image.width && img.height == image.height)); - if (!exists.length) { - loading.push(loadImage(image)); - images.push(image); - images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left)); - } - } + loading.push(loadImage(image)); + images.push(image); + images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left)); } } }; + const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars + if (inText) { + switch (tag) { + case 'i': + italic = false; + break; + case 'b': + bold = false; + break; + } + } + + if (tag == 'text') + inText = false; + }; + let buf = this.decode(data).toString(); sax.parseSync(buf, { - onStartNode + onStartNode, onEndNode, onTextNode }); putPageLines(); @@ -290,6 +297,7 @@ class ConvertPdf extends ConvertHtml { let concat = ''; let sp = ''; + let firstLine = true; for (const line of lines) { if (text.length > limitSize) { throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`); @@ -301,10 +309,15 @@ class ConvertPdf extends ConvertHtml { } if (line.isSectionTitle) { - text += `${line.text.trim()}`; + if (firstLine) + text += `${line.text.trim()}`; + else + text += `${line.text.trim()}`; continue; } + firstLine = false; + if (line.isSubtitle) { text += `
${line.text.trim()}`; continue;