diff --git a/server/core/BookConverter/index.js b/server/core/BookConverter/index.js index 981b4fda..bda48687 100644 --- a/server/core/BookConverter/index.js +++ b/server/core/BookConverter/index.js @@ -3,6 +3,7 @@ const URL = require('url').URL; const iconv = require('iconv-lite'); const chardet = require('chardet'); const _ = require('lodash'); +const sax = require('./sax'); const FileDetector = require('../FileDetector'); @@ -54,60 +55,6 @@ class BookConverter { return iconv.decode(data, selected); } - parseHtml(buf, onNode, onText, innerCut) { - if (!onNode) - onNode = () => {}; - if (!onText) - onText = () => {}; - if (!innerCut) - innerCut = new Set(); - - buf = buf.replace(/ /g, ' '); - - let i = 0; - const len = buf.length; - let cutCounter = 0; - let cutTag = ''; - while (i < len) { - let left = buf.indexOf('<', i); - if (left < 0) - break; - let right = buf.indexOf('>', left + 1); - if (right < 0) - break; - - let tag = buf.substr(left + 1, right - left - 1).trim().toLowerCase(); - let tail = ''; - const firstSpace = tag.indexOf(' '); - if (firstSpace >= 0) { - tail = tag.substr(firstSpace); - tag = tag.substr(0, firstSpace); - } - - const text = buf.substr(i, left - i); - - onText(text, cutCounter, cutTag); - onNode(tag, tail, cutCounter, cutTag); - - if (innerCut.has(tag) && (!cutCounter || cutTag == tag)) { - if (!cutCounter) - cutTag = tag; - cutCounter++; - } - - if (tag != '' && tag.charAt(0) == '/' && cutTag == tag.substr(1)) { - cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0); - if (!cutCounter) - cutTag = ''; - } - - i = right + 1; - } - - if (i < len) - onText(buf.substr(i, len - i), cutCounter, cutTag); - } - convertHtml(data, isText) { let titleInfo = {}; let desc = {_n: 'description', 'title-info': titleInfo}; @@ -267,75 +214,75 @@ class BookConverter { newParagraph(); - const onNode = (elemName, tail, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars + const onStartNode = (elemName, tail, left, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars if (elemName == '') return; - if (elemName[0] == '!') {//comment - const text = elemName + tail; - if (text == '!----------- собственно произведение ---------------') - inText = true; - if (text == '!---------------------------------------------------') - inText = false; - } else if (elemName[0] != '/') {//open tag - if (!inText) { - path += '/' + elemName; - tag = elemName; - } else { - if (!inSubtitle && (elemName == 'p' || elemName == 'dd')) { - newParagraph(); - } - - switch (elemName) { - case 'i': - openTag('emphasis'); - break; - case 'b': - openTag('strong'); - break; - case 'div': - if (tail.indexOf('align="center"') >= 0) { - openTag('subtitle'); - inSubtitle = true; - } - break; - } + if (!inText) { + path += '/' + elemName; + tag = elemName; + } else { + if (!inSubtitle && (elemName == 'p' || elemName == 'dd')) { + newParagraph(); } - } else if (elemName[0] == '/') {//close tag - elemName = elemName.substr(1); - if (!inText) { - const oldPath = path; - let t = ''; - do { - let i = path.lastIndexOf('/'); - t = path.substr(i + 1); - path = path.substr(0, i); - } while (t != elemName && path); - if (t != elemName) { - path = oldPath; - } - - let i = path.lastIndexOf('/'); - tag = path.substr(i + 1); - } else { - switch (elemName) { - case 'i': - closeTag('emphasis'); - break; - case 'b': - closeTag('strong'); - break; - case 'div': - if (inSubtitle) - closeTag('subtitle'); - inSubtitle = false; - break; - } + switch (elemName) { + case 'i': + openTag('emphasis'); + break; + case 'b': + openTag('strong'); + break; + case 'div': + if (tail.indexOf('align="center"') >= 0) { + openTag('subtitle'); + inSubtitle = true; + } + break; } } }; - const onText = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars + const onEndNode = (elemName, tail, left, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars + if (!inText) { + const oldPath = path; + let t = ''; + do { + let i = path.lastIndexOf('/'); + t = path.substr(i + 1); + path = path.substr(0, i); + } while (t != elemName && path); + + if (t != elemName) { + path = oldPath; + } + + let i = path.lastIndexOf('/'); + tag = path.substr(i + 1); + } else { + switch (elemName) { + case 'i': + closeTag('emphasis'); + break; + case 'b': + closeTag('strong'); + break; + case 'div': + if (inSubtitle) + closeTag('subtitle'); + inSubtitle = false; + break; + } + } + }; + + const onComment = (text, left, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars + if (text == '--------- Собственно произведение -------------') + inText = true; + if (text == '-----------------------------------------------') + inText = false; + }; + + const onTextNode = (text, left, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars if (text != ' ' && text.trim() == '') text = text.trim(); @@ -363,8 +310,10 @@ class BookConverter { growParagraph(text); }; - this.parseHtml(this.decode(data).toString(), - onNode, onText, new Set(['head', 'script', 'style'])); + sax.parse(this.decode(data).toString(), { + onStartNode, onEndNode, onTextNode, onComment, + innerCut: new Set(['head', 'script', 'style']) + }); const title = (titleInfo['book-title'] ? titleInfo['book-title'] : ''); let author = ''; @@ -395,7 +344,7 @@ class BookConverter { formatFb2Node(node, name) { let out = ''; - const repl = (text) => text.replace(/[\t\n\r]/g, ' '); + const repl = (text) => text.replace(/ |[\t\n\r]/g, ' '); if (Array.isArray(node)) { for (const n of node) { diff --git a/server/core/BookConverter/sax.js b/server/core/BookConverter/sax.js new file mode 100644 index 00000000..83b05dcc --- /dev/null +++ b/server/core/BookConverter/sax.js @@ -0,0 +1,130 @@ +function parse(xstr, options) { + let {onStartNode, onEndNode, onTextNode, onCdata, onComment, innerCut} = options; + + if (!onStartNode) + onStartNode = () => {}; + if (!onEndNode) + onEndNode = () => {}; + if (!onTextNode) + onTextNode = () => {}; + if (!onCdata) + onCdata = () => {}; + if (!onComment) + onComment = () => {}; + + if (!innerCut) + innerCut = new Set(); + + let i = 0; + const len = xstr.length; + let cutCounter = 0; + let cutTag = ''; + let inCdata; + let inComment; + while (i < len) { + inCdata = false; + inComment = false; + + let left = xstr.indexOf('<', i); + if (left < 0) + break; + let leftData = left; + + if (left < len - 2 && xstr[left + 1] == '!') { + if (xstr[left + 2] == '-') { + const leftComment = xstr.indexOf('', leftData + 1); + if (rightData < 0) + break; + right = rightData + 2; + } else { + rightData = xstr.indexOf('>', leftData + 1); + if (rightData < 0) + break; + right = rightData; + } + + let tagData = xstr.substr(leftData + 1, rightData - leftData - 1); + + if (inCdata) { + onCdata(tagData, left, cutCounter, cutTag); + } else if (inComment) { + onComment(tagData, left, cutCounter, cutTag); + } else { + let tag = ''; + let tail = ''; + const firstSpace = tagData.indexOf(' '); + if (firstSpace >= 0) { + tail = tagData.substr(firstSpace); + tag = tagData.substr(0, firstSpace); + } else { + tag = tagData; + } + tag = tag.toLowerCase(); + + const text = xstr.substr(i, left - i); + + onTextNode(text, left, cutCounter, cutTag); + + let endTag = ''; + if (tag === '' || tag[0] !== '/') { + onStartNode(tag, tail, left, cutCounter, cutTag); + } else { + endTag = tag.substr(1); + onEndNode(endTag, tail, left, cutCounter, cutTag); + } + + if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) { + if (!cutCounter) + cutTag = tag; + cutCounter++; + } + + if (cutTag === endTag) { + cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0); + if (!cutCounter) + cutTag = ''; + } + } + + i = right + 1; + } + + if (i < len) { + if (inCdata) { + onCdata(xstr.substr(i, len - i), len - 1, cutCounter, cutTag); + } else if (inComment) { + onComment(xstr.substr(i, len - i), len - 1, cutCounter, cutTag); + } else { + onTextNode(xstr.substr(i, len - i), len - 1, cutCounter, cutTag); + } + } +} + + +module.exports = { + parse +} \ No newline at end of file