From c9b65a3c438a5c72d26c6d3f463f81e0b43cf02b Mon Sep 17 00:00:00 2001 From: Book Pauk Date: Thu, 28 Feb 2019 23:02:34 +0700 Subject: [PATCH] =?UTF-8?q?=D0=A3=D0=BB=D1=83=D1=87=D1=88=D0=B5=D0=BD?= =?UTF-8?q?=D0=B8=D0=B5=20=D0=BA=D0=BE=D0=BD=D0=B2=D0=B5=D1=80=D1=82=D0=B8?= =?UTF-8?q?=D1=80=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D1=8F=20Pdf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/core/BookConverter/ConvertHtml.js | 15 +++++++--- server/core/BookConverter/ConvertPdf.js | 38 ++++++++++++------------ 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/server/core/BookConverter/ConvertHtml.js b/server/core/BookConverter/ConvertHtml.js index a49ba8b1..4bec3ad2 100644 --- a/server/core/BookConverter/ConvertHtml.js +++ b/server/core/BookConverter/ConvertHtml.js @@ -109,13 +109,20 @@ class ConvertHtml extends ConvertBase { //подозрение на чистый текст, надо разбить на параграфы if (isText || pars.length < buf.length/2000) { let total = 0; + let max = 0; for (let i = 0; i < spaceCounter.length; i++) { - total += (spaceCounter[i] ? spaceCounter[i] : 0); + const sc = (spaceCounter[i] ? spaceCounter[i] : 0); + max = (sc > max ? sc : max); + total += sc; } - total /= 20; - let i = spaceCounter.length - 1; - while (i > 0 && (!spaceCounter[i] || spaceCounter[i] < total)) i--; + let i = 0; + //если разброс не слишком большой + if (total < max*2) { + total /= 20; + i = spaceCounter.length - 1; + while (i > 0 && (!spaceCounter[i] || spaceCounter[i] < total)) i--; + } const parIndent = (i > 0 ? i : 0); diff --git a/server/core/BookConverter/ConvertPdf.js b/server/core/BookConverter/ConvertPdf.js index 8b6816ac..ae341afd 100644 --- a/server/core/BookConverter/ConvertPdf.js +++ b/server/core/BookConverter/ConvertPdf.js @@ -22,16 +22,21 @@ class ConvertPdf extends ConvertHtml { const outFile = `${inputFiles.fileListDir}/${utils.randomHexString(10)}.xml`; //конвертируем в xml - await this.execConverter(this.pdfToHtmlPath, ['-c', '-s', '-xml', inputFiles.sourceFile, outFile]); - callback(50); + let perc = 0; + await this.execConverter(this.pdfToHtmlPath, ['-c', '-s', '-xml', inputFiles.sourceFile, outFile], () => { + perc = (perc < 80 ? perc + 10 : 40); + callback(perc); + }); + callback(80); const data = await fs.readFile(outFile); - callback(60); + callback(90); //парсим xml let lines = []; let inText = false; let title = ''; + let prevTop = 0; let i = -1; const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars @@ -48,16 +53,19 @@ class ConvertPdf extends ConvertHtml { let attrs = sax.getAttrsSync(tail); const line = { text: '', - top: (attrs.top && attrs.top.value ? attrs.top.value : null), - left: (attrs.left && attrs.left.value ? attrs.left.value : null), - width: (attrs.width && attrs.width.value ? attrs.width.value : null), - height: (attrs.height && attrs.height.value ? attrs.height.value : null), + top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10), + left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10), + width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10), + height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10), }; if (line.width !== '0' || line.height !== '0') { inText = true; - i++; - lines[i] = line; + if (isNaN(line.top) || isNaN(prevTop) || (Math.abs(prevTop - line.top) > 3)) { + i++; + lines[i] = line; + } + prevTop = line.top; } } } @@ -76,16 +84,8 @@ class ConvertPdf extends ConvertHtml { //найдем параграфы и отступы const indents = []; for (const line of lines) { - const top = parseInt(line.top); - const left = parseInt(line.left); - - if (!isNaN(top)) { - line.top = top; - } - - if (!isNaN(left)) { - indents[left] = 1; - line.left = left; + if (!isNaN(line.left)) { + indents[line.left] = 1; } }