From 55d5f6524d87995e0576f97c708d384217962e87 Mon Sep 17 00:00:00 2001
From: Book Pauk
Date: Thu, 28 Feb 2019 18:58:41 +0700
Subject: [PATCH] =?UTF-8?q?=D0=A0=D0=B0=D0=B1=D0=BE=D1=82=D0=B0=20=D0=BD?=
 =?UTF-8?q?=D0=B0=D0=B4=20=D0=BA=D0=BE=D0=BD=D0=B2=D0=B5=D1=80=D1=82=D0=B5?=
 =?UTF-8?q?=D1=80=D0=BE=D0=BC=20Pdf?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 server/core/BookConverter/ConvertPdf.js | 97 +++++++++++++++++++++++++
 server/core/BookConverter/index.js      |  1 +
 server/core/FileDecompressor.js         |  2 +
 3 files changed, 100 insertions(+)
 create mode 100644 server/core/BookConverter/ConvertPdf.js

diff --git a/server/core/BookConverter/ConvertPdf.js b/server/core/BookConverter/ConvertPdf.js
new file mode 100644
index 00000000..a29f495a
--- /dev/null
+++ b/server/core/BookConverter/ConvertPdf.js
@@ -0,0 +1,97 @@
+const fs = require('fs-extra');
+
+const sax = require('./sax');
+const utils = require('../utils');
+const ConvertHtml = require('./ConvertHtml');
+
+/**
+ * PDF book converter: runs the external converter configured in
+ * this.pdfToHtmlPath to dump the document as XML, collects the contents of the
+ * "text" elements and passes the resulting plain text on to ConvertHtml.
+ */
+class ConvertPdf extends ConvertHtml {
+    /**
+     * Tells whether this converter should handle the input.
+     *
+     * @param {*} data - not used by this check
+     * @param {Object} opts - conversion options; only opts.inputFiles is read
+     * @returns {*} truthy when this.config.useExternalBookConverter is set
+     *     and inputFiles.sourceFileType.ext is 'pdf'
+     */
+    check(data, opts) {
+        const {inputFiles} = opts;
+
+        return this.config.useExternalBookConverter &&
+            inputFiles.sourceFileType && inputFiles.sourceFileType.ext === 'pdf';
+    }
+
+    /**
+     * Converts the PDF source file to text and hands it to ConvertHtml.run().
+     *
+     * @param {*} notUsed - ignored; the source is read from opts.inputFiles.sourceFile
+     * @param {Object} opts - conversion options; inputFiles and callback are read
+     * @returns {Promise} resolves to false when check() rejects the input,
+     *     otherwise to whatever ConvertHtml.run() returns
+     */
+    async run(notUsed, opts) {
+        if (!this.check(notUsed, opts))
+            return false;
+        await this.checkExternalConverterPresent();
+
+        const {inputFiles, callback} = opts;
+
+        const outFile = `${inputFiles.fileListDir}/${utils.randomHexString(10)}.xml`;
+
+        //convert to xml
+        await this.execConverter(this.pdfToHtmlPath, ['-c', '-s', '-xml', inputFiles.sourceFile, outFile]);
+        callback(50);
+
+        const data = await fs.readFile(outFile);
+        callback(60);
+
+        //parse the xml: every "text" element becomes one entry of lines
+        const lines = [];
+        let current = null;//the line being filled while inside a "text" element
+        let title = '';
+
+        const onTextNode = (text, cutCounter) => {
+            if (!cutCounter && current) {
+                current.text += text + ' ';
+                //the first two lines also make up the title
+                if (lines.length <= 2)
+                    title += text + ' ';
+            }
+        };
+
+        const onStartNode = (tag, tail, singleTag, cutCounter) => {
+            if (!cutCounter && tag === 'text' && !current) {
+                const attrs = sax.getAttrsSync(tail);
+                current = {
+                    text: '',
+                    top: (attrs.top && attrs.top.value ? attrs.top.value : null),
+                    left: (attrs.left && attrs.left.value ? attrs.left.value : null),
+                };
+                lines.push(current);
+            }
+        };
+
+        const onEndNode = (tag) => {
+            if (tag === 'text')
+                current = null;
+        };
+
+        const buf = this.decode(data).toString();
+        sax.parseSync(buf, {
+            onStartNode, onEndNode, onTextNode
+        });
+
+        //TODO: find paragraphs and indents using the top/left coordinates
+
+        //build the text: the title first, then one row per extracted line
+        const body = lines.map((line) => line.text.trim()).join('\n');
+        const text = title + "\n" + body;
+        return await super.run(Buffer.from(text), {skipCheck: true, isText: true});
+    }
+}
+
+module.exports = ConvertPdf;
diff --git a/server/core/BookConverter/index.js b/server/core/BookConverter/index.js
index 6249b911..6f680324 100644
--- a/server/core/BookConverter/index.js
+++ b/server/core/BookConverter/index.js
@@ -3,6 +3,7 @@ const FileDetector = require('../FileDetector');
 
 //порядок важен
 const convertClassFactory = [
+    require('./ConvertPdf'),
     require('./ConvertRtf'),
     require('./ConvertDocX'),
     require('./ConvertDoc'),
diff --git a/server/core/FileDecompressor.js b/server/core/FileDecompressor.js
index 9948147b..73d46538 100644
--- a/server/core/FileDecompressor.js
+++ b/server/core/FileDecompressor.js
@@ -15,6 +15,8 @@ class FileDecompressor {
     }
 
     async decompressFile(filename, outputDir) {
+        await fs.ensureDir(outputDir);
+
         const fileType = await this.detector.detectFile(filename);
 
         let result = {