From 4bcd45a795dc76ac149d596dff23c16458d741b3 Mon Sep 17 00:00:00 2001 From: Book Pauk Date: Wed, 27 Feb 2019 03:59:08 +0700 Subject: [PATCH] =?UTF-8?q?=D0=A0=D0=B5=D1=84=D0=B0=D0=BA=D1=82=D0=BE?= =?UTF-8?q?=D1=80=D0=B8=D0=BD=D0=B3=20-=20=D0=B2=D1=8B=D0=BD=D0=B5=D1=81?= =?UTF-8?q?=20=D0=BC=D0=B5=D1=82=D0=BE=D0=B4=D1=8B=20=D0=BA=D0=BE=D0=BD?= =?UTF-8?q?=D0=B2=D0=B5=D1=80=D1=82=D0=B8=D1=80=D0=BE=D0=B2=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D1=8F=20=D0=B2=20=D0=BE=D1=82=D0=B4=D0=B5=D0=BB=D1=8C?= =?UTF-8?q?=D0=BD=D1=8B=D0=B5=20=D0=BA=D0=BB=D0=B0=D1=81=D1=81=D1=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/core/BookConverter/ConvertBase.js | 96 ++++ server/core/BookConverter/ConvertFb2.js | 41 ++ server/core/BookConverter/ConvertHtml.js | 162 +++++++ server/core/BookConverter/ConvertSamlib.js | 275 +++++++++++ server/core/BookConverter/index.js | 533 +-------------------- server/core/FileDetector.js | 24 +- server/core/ReaderWorker.js | 6 +- 7 files changed, 610 insertions(+), 527 deletions(-) create mode 100644 server/core/BookConverter/ConvertBase.js create mode 100644 server/core/BookConverter/ConvertFb2.js create mode 100644 server/core/BookConverter/ConvertHtml.js create mode 100644 server/core/BookConverter/ConvertSamlib.js diff --git a/server/core/BookConverter/ConvertBase.js b/server/core/BookConverter/ConvertBase.js new file mode 100644 index 00000000..460d91c3 --- /dev/null +++ b/server/core/BookConverter/ConvertBase.js @@ -0,0 +1,96 @@ +const iconv = require('iconv-lite'); +const chardet = require('chardet'); +const textUtils = require('./textUtils'); + +class ConvertBase { + constructor(config) { + this.config = config; + } + + run(data, opts) {// eslint-disable-line no-unused-vars + //override + } + + decode(data) { + let selected = textUtils.getEncoding(data); + + if (selected == 'ISO-8859-5') { + const charsetAll = chardet.detectAll(data.slice(0, 20000)); + for (const charset of charsetAll) { + if (charset.name.indexOf('ISO-8859') < 0) { + selected = charset.name; + break; + } + } + } + + if (selected.toLowerCase() != 'utf-8') + return iconv.decode(data, selected); + else + return data; + } + + repSpaces(text) { + return text.replace(/ |[\t\n\r]/g, ' '); + } + + formatFb2(fb2) { + let out = ''; + out += ''; + out += this.formatFb2Node(fb2); + out += ''; + return out; + } + + formatFb2Node(node, name) { + let out = ''; + + if (Array.isArray(node)) { + for (const n of node) { + out += this.formatFb2Node(n); + } + } else if (typeof node == 'string') { + if (name) + out += `<${name}>${this.repSpaces(node)}`; + else + out += this.repSpaces(node); + } else { + if (node._n) + name = node._n; + + let attrs = ''; + if (node._attrs) { + for (let attrName in node._attrs) { + attrs += ` ${attrName}="${node._attrs[attrName]}"`; + } + } + + let tOpen = ''; + let tBody = ''; + let tClose = ''; + if (name) + tOpen += `<${name}${attrs}>`; + if (node.hasOwnProperty('_t')) + tBody += this.repSpaces(node._t); + + for (let nodeName in node) { + if (nodeName && nodeName[0] == '_' && nodeName != '_a') + continue; + + const n = node[nodeName]; + tBody += this.formatFb2Node(n, nodeName); + } + + if (name) + tClose += ``; + + if (attrs == '' && name == 'p' && tBody.trim() == '') + out += '' + else + out += `${tOpen}${tBody}${tClose}`; + } + return out; + } +} + +module.exports = ConvertBase; \ No newline at end of file diff --git a/server/core/BookConverter/ConvertFb2.js b/server/core/BookConverter/ConvertFb2.js new file mode 100644 index 00000000..e4a84866 --- /dev/null +++ b/server/core/BookConverter/ConvertFb2.js @@ -0,0 +1,41 @@ +const ConvertBase = require('./ConvertBase'); +const iconv = require('iconv-lite'); + +class ConvertFb2 extends ConvertBase { + check(data, opts) { + const {fileType} = opts; + + return (fileType && fileType.ext == 'xml' && data.toString().indexOf('= 0); + } + + run(data, opts) { + if (!this.check(data, opts)) + return false; + + return this.checkEncoding(data); + } + + checkEncoding(data) { + let result = data; + + const left = data.indexOf('= 0) { + const right = data.indexOf('?>', left); + if (right >= 0) { + const head = data.slice(left, right + 2).toString(); + const m = head.match(/encoding="(.*)"/); + if (m) { + let encoding = m[1].toLowerCase(); + if (encoding != 'utf-8') { + result = iconv.decode(data, encoding); + result = Buffer.from(result.toString().replace(m[0], 'encoding="utf-8"')); + } + } + } + } + + return result; + } +} + +module.exports = ConvertFb2; diff --git a/server/core/BookConverter/ConvertHtml.js b/server/core/BookConverter/ConvertHtml.js new file mode 100644 index 00000000..fa85937d --- /dev/null +++ b/server/core/BookConverter/ConvertHtml.js @@ -0,0 +1,162 @@ +const ConvertBase = require('./ConvertBase'); +const sax = require('./sax'); +const textUtils = require('./textUtils'); + +class ConvertHtml extends ConvertBase { + check(data, opts) { + const {fileType} = opts; + + if (fileType && (fileType.ext == 'html' || fileType.ext == 'xml')) + return {isText: false}; + + //может это чистый текст? + if (textUtils.checkIfText(data)) { + return {isText: true}; + } + + return false; + } + + run(data, opts) { + const checkResult = this.check(data, opts); + if (!checkResult) + return false; + + let {isText} = checkResult; + let titleInfo = {}; + let desc = {_n: 'description', 'title-info': titleInfo}; + let pars = []; + let body = {_n: 'body', section: {_a: []}}; + let fb2 = [desc, body]; + + let title = ''; + let inTitle = false; + + let spaceCounter = []; + + const repCrLfTab = (text) => text.replace(/[\n\r]/g, '').replace(/\t/g, ' '); + + const newParagraph = () => { + pars.push({_n: 'p', _t: ''}); + }; + + const growParagraph = (text) => { + if (!pars.length) + newParagraph(); + + const l = pars.length; + if (pars[l - 1]._t == '') + text = text.trimLeft(); + pars[l - 1]._t += text; + + //посчитаем отступы у текста, чтобы выделить потом параграфы + const lines = text.split('\n'); + for (let line of lines) { + line = repCrLfTab(line) + + let l = 0; + while (l < line.length && line[l] == ' ') { + l++; + } + if (!spaceCounter[l]) + spaceCounter[l] = 0; + spaceCounter[l]++; + } + }; + + const newPara = new Set(['tr', 'br', 'br/', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']); + + const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars + if (!cutCounter) { + growParagraph(text); + } + + if (inTitle && !title) + title = text; + }; + + const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars + if (!cutCounter) { + if (newPara.has(tag)) + newParagraph(); + } + + if (tag == 'title') + inTitle = true; + }; + + const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars + if (tag == 'title') + inTitle = false; + }; + + let buf = this.decode(data).toString(); + + sax.parseSync(buf, { + onStartNode, onEndNode, onTextNode, + innerCut: new Set(['head', 'script', 'style', 'binary']) + }); + + titleInfo['book-title'] = title; + + //подозрение на чистый текст, надо разбить на параграфы + if (isText || pars.length < buf.length/2000) { + let total = 0; + for (let i = 0; i < spaceCounter.length; i++) { + total += (spaceCounter[i] ? spaceCounter[i] : 0); + } + total /= 10; + let i = spaceCounter.length - 1; + while (i > 0 && (!spaceCounter[i] || spaceCounter[i] < total)) i--; + + const parIndent = (i > 0 ? i : 0); + + let newPars = []; + const newPar = () => { + newPars.push({_n: 'p', _t: ''}); + }; + + const growPar = (text) => { + if (!newPars.length) + newPar(); + + const l = newPars.length; + newPars[l - 1]._t += text; + } + + i = 0; + for (const par of pars) { + if (i > 0) + newPar(); + i++; + + const lines = par._t.split('\n'); + for (let line of lines) { + line = repCrLfTab(line); + + let l = 0; + while (l < line.length && line[l] == ' ') { + l++; + } + + if (l >= parIndent) + newPar(); + growPar(line.trim() + ' '); + } + } + + body.section._a[0] = newPars; + } else { + body.section._a[0] = pars; + } + + //убираем лишнее + for (let i = 0; i < pars.length; i++) + pars[i]._t = this.repSpaces(pars[i]._t).trim(); + + return this.formatFb2(fb2); + } + +} + +module.exports = ConvertHtml; diff --git a/server/core/BookConverter/ConvertSamlib.js b/server/core/BookConverter/ConvertSamlib.js new file mode 100644 index 00000000..40ebd10b --- /dev/null +++ b/server/core/BookConverter/ConvertSamlib.js @@ -0,0 +1,275 @@ +const _ = require('lodash'); +const URL = require('url').URL; + +const sax = require('./sax'); +const ConvertBase = require('./ConvertBase'); + +class ConvertSamlib extends ConvertBase { + check(data, opts) { + const {url} = opts; + + const parsedUrl = new URL(url); + if (parsedUrl.hostname == 'samlib.ru' || + parsedUrl.hostname == 'budclub.ru' || + parsedUrl.hostname == 'zhurnal.lib.ru') { + return {hostname: parsedUrl.hostname}; + } + + return false; + } + + run(data, opts) { + const checkResult = this.check(data, opts); + if (!checkResult) + return false; + + const {hostname} = checkResult; + let titleInfo = {}; + let desc = {_n: 'description', 'title-info': titleInfo}; + let pars = []; + let body = {_n: 'body', section: {_a: pars}}; + let fb2 = [desc, body]; + + let inSubtitle = false; + let inJustify = true; + let inImage = false; + let isFirstPara = false; + let path = ''; + let tag = '';// eslint-disable-line no-unused-vars + + let inText = false; + let textFound = false; + let node = {_a: pars}; + + let inPara = false; + let italic = false; + let bold = false; + + const openTag = (name, attrs) => { + if (name == 'p') + inPara = true; + let n = {_n: name, _attrs: attrs, _a: [], _p: node}; + node._a.push(n); + node = n; + }; + + const closeTag = (name) => { + if (name == 'p') + inPara = false; + if (node._p) { + const exact = (node._n == name); + node = node._p; + if (!exact) + closeTag(name); + } + }; + + const growParagraph = (text) => { + if (!node._p) { + if (text.trim() != '') + openTag('p'); + else + return; + } + if (node._n == 'p' && node._a.length == 0) + text = text.trimLeft(); + node._a.push({_t: text}); + }; + + const onStartNode = (elemName, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars + if (elemName == '') + return; + if (!inText) { + path += '/' + elemName; + tag = elemName; + } else { + switch (elemName) { + case 'li': + case 'p': + case 'dd': + case 'br': + if (!(inSubtitle && isFirstPara)) { + if (inPara) + closeTag('p'); + openTag('p'); + } + isFirstPara = false; + break; + case 'h1': + case 'h2': + case 'h3': + if (inPara) + closeTag('p'); + openTag('p'); + bold = true; + break; + case 'i': + case 'em': + italic = true; + break; + case 'b': + case 'strong': + bold = true; + break; + case 'div': + if (inPara) + closeTag('p'); + if (tail.indexOf('align="center"') >= 0) { + openTag('subtitle'); + inSubtitle = true; + isFirstPara = true; + } + + if (tail.indexOf('align="justify"') >= 0) { + openTag('p'); + inJustify = true; + } + + break; + case 'img': { + if (inPara) + closeTag('p'); + const attrs = sax.getAttrsSync(tail); + if (attrs.src && attrs.src.value) { + let href = attrs.src.value; + if (href[0] == '/') + href = `http://${hostname}${href}`; + openTag('image', {href}); + inImage = true; + } + break; + } + } + } + }; + + const onEndNode = (elemName, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars + if (!inText) { + const oldPath = path; + let t = ''; + do { + let i = path.lastIndexOf('/'); + t = path.substr(i + 1); + path = path.substr(0, i); + } while (t != elemName && path); + + if (t != elemName) { + path = oldPath; + } + + let i = path.lastIndexOf('/'); + tag = path.substr(i + 1); + } else { + switch (elemName) { + case 'li': + case 'p': + case 'dd': + closeTag('p'); + break; + case 'h1': + case 'h2': + case 'h3': + closeTag('p'); + bold = false; + break; + case 'i': + case 'em': + italic = false; + break; + case 'b': + case 'strong': + bold = false; + break; + case 'div': + if (inSubtitle) { + closeTag('subtitle'); + inSubtitle = false; + isFirstPara = false; + } + + if (inJustify) { + closeTag('p'); + inJustify = false; + } + break; + case 'img': + if (inImage) + closeTag('image'); + inImage = false; + break; + } + } + }; + + const onComment = (text) => {// eslint-disable-line no-unused-vars + if (text == '--------- Собственно произведение -------------') { + inText = true; + textFound = true; + } + if (text == '-----------------------------------------------') + inText = false; + }; + + const onTextNode = (text) => {// eslint-disable-line no-unused-vars + if (text && text.trim() == '') + text = (text.indexOf(' ') >= 0 ? ' ' : ''); + + if (!text) + return; + + switch (path) { + case '/html/body/center/h2': + titleInfo['book-title'] = text; + return; + case '/html/body/div/h3': + if (!titleInfo.author) + titleInfo.author = {}; + text = text.replace(':', '').trim().split(' '); + if (text[0]) + titleInfo.author['last-name'] = text[0]; + if (text[1]) + titleInfo.author['first-name'] = text[1]; + if (text[2]) + titleInfo.author['middle-name'] = text[2]; + return; + } + + let tOpen = (bold ? '' : ''); + tOpen += (italic ? '' : ''); + let tClose = (italic ? '' : ''); + tClose += (bold ? '' : ''); + + if (inText) + growParagraph(`${tOpen}${text}${tClose}`); + }; + + sax.parseSync(this.decode(data).toString().replace(/ /g, ' '), { + onStartNode, onEndNode, onTextNode, onComment, + innerCut: new Set(['head', 'script', 'style']) + }); + + //текст не найден на странице, обработать корректно не получилось + if (!textFound) + return false; + + const title = (titleInfo['book-title'] ? titleInfo['book-title'] : ''); + let author = ''; + if (titleInfo.author) { + author = _.compact([ + (titleInfo.author['last-name'] ? titleInfo.author['last-name'] : ''), + (titleInfo.author['first-name'] ? titleInfo.author['first-name'] : ''), + (titleInfo.author['middle-name'] ? titleInfo.author['middle-name'] : ''), + ]).join(' '); + } + + pars.unshift({_n: 'title', _a: [ + {_n: 'p', _t: author}, {_n: 'p', _t: ''}, + {_n: 'p', _t: title}, {_n: 'p', _t: ''}, + ]}) + + return this.formatFb2(fb2); + } + +} + +module.exports = ConvertSamlib; diff --git a/server/core/BookConverter/index.js b/server/core/BookConverter/index.js index 657ed90b..8d9a6fd3 100644 --- a/server/core/BookConverter/index.js +++ b/server/core/BookConverter/index.js @@ -1,536 +1,45 @@ const fs = require('fs-extra'); -const URL = require('url').URL; -const iconv = require('iconv-lite'); -const chardet = require('chardet'); -const _ = require('lodash'); -const sax = require('./sax'); -const textUtils = require('./textUtils'); - const FileDetector = require('../FileDetector'); -const repSpaces = (text) => text.replace(/ |[\t\n\r]/g, ' '); -const repSpaces2 = (text) => text.replace(/[\n\r]/g, ''); -const repSpaces3 = (text) => text.replace(/ /g, ' '); +//порядок важен +const convertClassFactory = [ + require('./ConvertFb2'), + require('./ConvertSamlib'), + require('./ConvertHtml'), +]; class BookConverter { - constructor() { + constructor(config) { this.detector = new FileDetector(); + + this.convertFactory = []; + for (const convertClass of convertClassFactory) { + this.convertFactory.push(new convertClass(config)); + } } async convertToFb2(inputFile, outputFile, url, callback) { const fileType = await this.detector.detectFile(inputFile); const data = await fs.readFile(inputFile); - callback(100); - - if (fileType && (fileType.ext == 'html' || fileType.ext == 'xml')) { - if (data.toString().indexOf('= 0) { - await fs.writeFile(outputFile, this.checkEncoding(data)); - return; + let result = false; + for (const convert of this.convertFactory) { + result = convert.run(data, {inputFile, url, callback, fileType}); + if (result) { + await fs.writeFile(outputFile, result); + break; } + } - const parsedUrl = new URL(url); - if (parsedUrl.hostname == 'samlib.ru' || - parsedUrl.hostname == 'budclub.ru' || - parsedUrl.hostname == 'zhurnal.lib.ru') { - await fs.writeFile(outputFile, this.convertSamlib(data, parsedUrl.hostname)); - return; - } - - await fs.writeFile(outputFile, this.convertHtml(data)); - return; - } else { + if (!result) { if (fileType) throw new Error(`Этот формат файла не поддерживается: ${fileType.mime}`); else { - //может это чистый текст? - if (textUtils.checkIfText(data)) { - await fs.writeFile(outputFile, this.convertHtml(data, true)); - return; - } - throw new Error(`Не удалось определить формат файла: ${url}`); } } - } - decode(data) { - let selected = textUtils.getEncoding(data); - - if (selected == 'ISO-8859-5') { - const charsetAll = chardet.detectAll(data.slice(0, 20000)); - for (const charset of charsetAll) { - if (charset.name.indexOf('ISO-8859') < 0) { - selected = charset.name; - break; - } - } - } - - if (selected.toLowerCase() != 'utf-8') - return iconv.decode(data, selected); - else - return data; - } - - checkEncoding(data) { - let result = data; - - const left = data.indexOf('= 0) { - const right = data.indexOf('?>', left); - if (right >= 0) { - const head = data.slice(left, right + 2).toString(); - const m = head.match(/encoding="(.*)"/); - if (m) { - let encoding = m[1].toLowerCase(); - if (encoding != 'utf-8') { - result = iconv.decode(data, encoding); - result = Buffer.from(result.toString().replace(m[0], 'encoding="utf-8"')); - } - } - } - } - - return result; - } - - convertHtml(data, isText) { - let titleInfo = {}; - let desc = {_n: 'description', 'title-info': titleInfo}; - let pars = []; - let body = {_n: 'body', section: {_a: []}}; - let fb2 = [desc, body]; - - let title = ''; - let inTitle = false; - - let spaceCounter = []; - - const newParagraph = () => { - pars.push({_n: 'p', _t: ''}); - }; - - const growParagraph = (text) => { - if (!pars.length) - newParagraph(); - - const l = pars.length; - if (pars[l - 1]._t == '') - text = text.trimLeft(); - pars[l - 1]._t += text; - - //посчитаем отступы у текста, чтобы выделить потом параграфы - const lines = text.split('\n'); - for (let line of lines) { - line = repSpaces2(line).replace(/\t/g, ' '); - - let l = 0; - while (l < line.length && line[l] == ' ') { - l++; - } - if (!spaceCounter[l]) - spaceCounter[l] = 0; - spaceCounter[l]++; - } - }; - - const newPara = new Set(['tr', 'br', 'br/', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']); - - const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars - if (!cutCounter) { - growParagraph(text); - } - - if (inTitle && !title) - title = text; - }; - - const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars - if (!cutCounter) { - if (newPara.has(tag)) - newParagraph(); - } - - if (tag == 'title') - inTitle = true; - }; - - const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars - if (tag == 'title') - inTitle = false; - }; - - let buf = this.decode(data).toString(); - - sax.parseSync(buf, { - onStartNode, onEndNode, onTextNode, - innerCut: new Set(['head', 'script', 'style']) - }); - - titleInfo['book-title'] = title; - - //подозрение на чистый текст, надо разбить на параграфы - if (isText || pars.length < buf.length/2000) { - let total = 0; - for (let i = 0; i < spaceCounter.length; i++) { - total += (spaceCounter[i] ? spaceCounter[i] : 0); - } - total /= 10; - let i = spaceCounter.length - 1; - while (i > 0 && (!spaceCounter[i] || spaceCounter[i] < total)) i--; - - const parIndent = (i > 0 ? i : 0); - - let newPars = []; - const newPar = () => { - newPars.push({_n: 'p', _t: ''}); - }; - - const growPar = (text) => { - if (!newPars.length) - newPar(); - - const l = newPars.length; - newPars[l - 1]._t += text; - } - - i = 0; - for (const par of pars) { - if (i > 0) - newPar(); - i++; - - const lines = par._t.split('\n'); - for (let line of lines) { - line = repSpaces2(line).replace(/\t/g, ' '); - - let l = 0; - while (l < line.length && line[l] == ' ') { - l++; - } - - if (l >= parIndent) - newPar(); - growPar(line.trim() + ' '); - } - } - - body.section._a[0] = newPars; - } else { - body.section._a[0] = pars; - } - - //убираем лишнее - for (let i = 0; i < pars.length; i++) - pars[i]._t = repSpaces(pars[i]._t).trim(); - - return this.formatFb2(fb2); - } - - convertSamlib(data, hostname) { - let titleInfo = {}; - let desc = {_n: 'description', 'title-info': titleInfo}; - let pars = []; - let body = {_n: 'body', section: {_a: pars}}; - let fb2 = [desc, body]; - - let inSubtitle = false; - let inJustify = true; - let inImage = false; - let isFirstPara = false; - let path = ''; - let tag = '';// eslint-disable-line no-unused-vars - - let inText = false; - let textFound = false; - let node = {_a: pars}; - - let inPara = false; - let italic = false; - let bold = false; - - const openTag = (name, attrs) => { - if (name == 'p') - inPara = true; - let n = {_n: name, _attrs: attrs, _a: [], _p: node}; - node._a.push(n); - node = n; - }; - - const closeTag = (name) => { - if (name == 'p') - inPara = false; - if (node._p) { - const exact = (node._n == name); - node = node._p; - if (!exact) - closeTag(name); - } - }; - - const growParagraph = (text) => { - if (!node._p) { - if (text.trim() != '') - openTag('p'); - else - return; - } - if (node._n == 'p' && node._a.length == 0) - text = text.trimLeft(); - node._a.push({_t: text}); - }; - - const onStartNode = (elemName, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars - if (elemName == '') - return; - if (!inText) { - path += '/' + elemName; - tag = elemName; - } else { - switch (elemName) { - case 'li': - case 'p': - case 'dd': - case 'br': - if (!(inSubtitle && isFirstPara)) { - if (inPara) - closeTag('p'); - openTag('p'); - } - isFirstPara = false; - break; - case 'h1': - case 'h2': - case 'h3': - if (inPara) - closeTag('p'); - openTag('p'); - bold = true; - break; - case 'i': - case 'em': - italic = true; - break; - case 'b': - case 'strong': - bold = true; - break; - case 'div': - if (inPara) - closeTag('p'); - if (tail.indexOf('align="center"') >= 0) { - openTag('subtitle'); - inSubtitle = true; - isFirstPara = true; - } - - if (tail.indexOf('align="justify"') >= 0) { - openTag('p'); - inJustify = true; - } - - break; - case 'img': { - if (inPara) - closeTag('p'); - const attrs = sax.getAttrsSync(tail); - if (attrs.src && attrs.src.value) { - let href = attrs.src.value; - if (href[0] == '/') - href = `http://${hostname}${href}`; - openTag('image', {href}); - inImage = true; - } - break; - } - } - } - }; - - const onEndNode = (elemName, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars - if (!inText) { - const oldPath = path; - let t = ''; - do { - let i = path.lastIndexOf('/'); - t = path.substr(i + 1); - path = path.substr(0, i); - } while (t != elemName && path); - - if (t != elemName) { - path = oldPath; - } - - let i = path.lastIndexOf('/'); - tag = path.substr(i + 1); - } else { - switch (elemName) { - case 'li': - case 'p': - case 'dd': - closeTag('p'); - break; - case 'h1': - case 'h2': - case 'h3': - closeTag('p'); - bold = false; - break; - case 'i': - case 'em': - italic = false; - break; - case 'b': - case 'strong': - bold = false; - break; - case 'div': - if (inSubtitle) { - closeTag('subtitle'); - inSubtitle = false; - isFirstPara = false; - } - - if (inJustify) { - closeTag('p'); - inJustify = false; - } - break; - case 'img': - if (inImage) - closeTag('image'); - inImage = false; - break; - } - } - }; - - const onComment = (text) => {// eslint-disable-line no-unused-vars - if (text == '--------- Собственно произведение -------------') { - inText = true; - textFound = true; - } - if (text == '-----------------------------------------------') - inText = false; - }; - - const onTextNode = (text) => {// eslint-disable-line no-unused-vars - if (text && text.trim() == '') - text = (text.indexOf(' ') >= 0 ? ' ' : ''); - - if (!text) - return; - - switch (path) { - case '/html/body/center/h2': - titleInfo['book-title'] = text; - return; - case '/html/body/div/h3': - if (!titleInfo.author) - titleInfo.author = {}; - text = text.replace(':', '').trim().split(' '); - if (text[0]) - titleInfo.author['last-name'] = text[0]; - if (text[1]) - titleInfo.author['first-name'] = text[1]; - if (text[2]) - titleInfo.author['middle-name'] = text[2]; - return; - } - - let tOpen = (bold ? '' : ''); - tOpen += (italic ? '' : ''); - let tClose = (italic ? '' : ''); - tClose += (bold ? '' : ''); - - if (inText) - growParagraph(`${tOpen}${text}${tClose}`); - }; - - sax.parseSync(repSpaces3(this.decode(data).toString()), { - onStartNode, onEndNode, onTextNode, onComment, - innerCut: new Set(['head', 'script', 'style']) - }); - - //текст не найден на странице, обрабатываем как html - if (!textFound) - return this.convertHtml(data); - - const title = (titleInfo['book-title'] ? titleInfo['book-title'] : ''); - let author = ''; - if (titleInfo.author) { - author = _.compact([ - (titleInfo.author['last-name'] ? titleInfo.author['last-name'] : ''), - (titleInfo.author['first-name'] ? titleInfo.author['first-name'] : ''), - (titleInfo.author['middle-name'] ? titleInfo.author['middle-name'] : ''), - ]).join(' '); - } - - pars.unshift({_n: 'title', _a: [ - {_n: 'p', _t: author}, {_n: 'p', _t: ''}, - {_n: 'p', _t: title}, {_n: 'p', _t: ''}, - ]}) - - return this.formatFb2(fb2); - } - - formatFb2(fb2) { - let out = ''; - out += ''; - out += this.formatFb2Node(fb2); - out += ''; - return out; - } - - formatFb2Node(node, name) { - let out = ''; - - if (Array.isArray(node)) { - for (const n of node) { - out += this.formatFb2Node(n); - } - } else if (typeof node == 'string') { - if (name) - out += `<${name}>${repSpaces(node)}`; - else - out += repSpaces(node); - } else { - if (node._n) - name = node._n; - - let attrs = ''; - if (node._attrs) { - for (let attrName in node._attrs) { - attrs += ` ${attrName}="${node._attrs[attrName]}"`; - } - } - - let tOpen = ''; - let tBody = ''; - let tClose = ''; - if (name) - tOpen += `<${name}${attrs}>`; - if (node.hasOwnProperty('_t')) - tBody += repSpaces(node._t); - - for (let nodeName in node) { - if (nodeName && nodeName[0] == '_' && nodeName != '_a') - continue; - - const n = node[nodeName]; - tBody += this.formatFb2Node(n, nodeName); - } - - if (name) - tClose += ``; - - if (attrs == '' && name == 'p' && tBody.trim() == '') - out += '' - else - out += `${tOpen}${tBody}${tClose}`; - } - return out; + callback(100); } } diff --git a/server/core/FileDetector.js b/server/core/FileDetector.js index 806389c3..692465aa 100644 --- a/server/core/FileDetector.js +++ b/server/core/FileDetector.js @@ -9,18 +9,18 @@ detect.addSignature( "rules": [ { "type": "or", "rules": [ - { "type": "contains", "bytes": "3c68746d6c" }, - { "type": "contains", "bytes": "3c00680074006d006c00" }, + { "type": "equal", "end": 5, "bytes": "3c68746d6c" }, + { "type": "equal", "end": 10, "bytes": "3c00680074006d006c00" }, - { "type": "contains", "bytes": "3c21646f6374797065" }, - { "type": "contains", "bytes": "3c626f6479" }, - { "type": "contains", "bytes": "3c68656164" }, - { "type": "contains", "bytes": "3c696672616d65" }, - { "type": "contains", "bytes": "3c696d67" }, - { "type": "contains", "bytes": "3c6f626a656374" }, - { "type": "contains", "bytes": "3c736372697074" }, - { "type": "contains", "bytes": "3c7461626c65" }, - { "type": "contains", "bytes": "3c7469746c65" }, + { "type": "equal", "end": 9, "bytes": "3c21646f6374797065" }, + { "type": "equal", "end": 5, "bytes": "3c626f6479" }, + { "type": "equal", "end": 5, "bytes": "3c68656164" }, + { "type": "equal", "end": 7, "bytes": "3c696672616d65" }, + { "type": "equal", "end": 4, "bytes": "3c696d67" }, + { "type": "equal", "end": 7, "bytes": "3c6f626a656374" }, + { "type": "equal", "end": 7, "bytes": "3c736372697074" }, + { "type": "equal", "end": 6, "bytes": "3c7461626c65" }, + { "type": "equal", "end": 6, "bytes": "3c7469746c65" }, ] } ] @@ -36,7 +36,7 @@ detect.addSignature( "rules": [ { "type": "or", "rules": [ - { "type": "contains", "bytes": "3c3f786d6c2076657273696f6e3d22312e3022" }, + { "type": "equal", "end": 19, "bytes": "3c3f786d6c2076657273696f6e3d22312e3022" }, ] } ] diff --git a/server/core/ReaderWorker.js b/server/core/ReaderWorker.js index a087da8c..5692f999 100644 --- a/server/core/ReaderWorker.js +++ b/server/core/ReaderWorker.js @@ -22,7 +22,7 @@ class ReaderWorker { this.down = new FileDownloader(); this.decomp = new FileDecompressor(); - this.bookConverter = new BookConverter(); + this.bookConverter = new BookConverter(this.config); if (!singleCleanExecute) { this.periodicCleanDir(this.config.tempPublicDir, this.config.maxTempPublicDirSize, 60*60*1000);//1 раз в час @@ -66,14 +66,14 @@ class ReaderWorker { const decompFilename = await this.decomp.decompressFile(downloadedFilename, decompDir); wState.set({progress: 100}); - //parse book + //конвертирование в fb2 wState.set({state: 'convert', step: 3, progress: 0}); convertFilename = `${this.config.tempDownloadDir}/${tempFilename2}`; await this.bookConverter.convertToFb2(decompFilename, convertFilename, url, progress => { wState.set({progress}); }); - //compress file to tmp dir, if not exists with the same hashname + //сжимаем файл в tmp, если там уже нет с тем же именем-sha256 const compFilename = await this.decomp.gzipFileIfNotExists(convertFilename, `${this.config.tempPublicDir}`); wState.set({progress: 100});