From a40d9e25b04b70263bc8ed5283d7ab4ccf2868c0 Mon Sep 17 00:00:00 2001 From: Book Pauk Date: Mon, 7 Nov 2022 19:52:29 +0700 Subject: [PATCH] =?UTF-8?q?=D0=A0=D0=B0=D0=B1=D0=BE=D1=82=D0=B0=20=D0=BD?= =?UTF-8?q?=D0=B0=D0=B4=20XmlParser?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/core/xml/Fb2Parser.js | 67 +++++++ server/core/xml/XmlParser.js | 342 ++++++++++++++++++++++++++++++++ server/core/xml/sax.js | 366 +++++++++++++++++++++++++++++++++++ server/core/xml/textUtils.js | 130 +++++++++++++ 4 files changed, 905 insertions(+) create mode 100644 server/core/xml/Fb2Parser.js create mode 100644 server/core/xml/XmlParser.js create mode 100644 server/core/xml/sax.js create mode 100644 server/core/xml/textUtils.js diff --git a/server/core/xml/Fb2Parser.js b/server/core/xml/Fb2Parser.js new file mode 100644 index 0000000..05383e9 --- /dev/null +++ b/server/core/xml/Fb2Parser.js @@ -0,0 +1,67 @@ +const fs = require('fs-extra'); +const iconv = require('iconv-lite'); +const textUtils = require('./textUtils'); + +const xmlParser = require('./xmlParser'); +const utils = require('../utils'); + +class Fb2Parser { + checkEncoding(data) { + //Корректируем кодировку UTF-16 + let encoding = textUtils.getEncoding(data); + if (encoding.indexOf('UTF-16') == 0) { + data = Buffer.from(iconv.decode(data, encoding)); + encoding = 'utf-8'; + } + + //Корректируем пробелы, всякие файлы попадаются :( + if (data[0] == 32) { + data = Buffer.from(data.toString().trim()); + } + + //Окончательно корректируем кодировку + let result = data; + + let left = data.indexOf('= 0) { + const right = data.indexOf('?>', left); + if (right >= 0) { + const head = data.slice(left, right + 2).toString(); + const m = head.match(/encoding=['"](.*?)['"]/); + if (m) { + let enc = m[1].toLowerCase(); + if (enc != 'utf-8') { + //enc может не соответсвовать реальной кодировке файла, поэтому: + if (encoding.indexOf('ISO-8859') >= 0) { + encoding = enc; + } + + result = iconv.decode(data, encoding); + result = Buffer.from(result.toString().replace(m[0], `encoding="utf-8"`)); + } + } + } + } + + return result; + } + + async getDescAndCover(bookFile) { + let data = await fs.readFile(bookFile); + data = await utils.gunzipBuffer(data); + //data = this.checkEncoding(data); + + const result = xmlParser.parseXml(data.toString(), true, (route) => { + console.log(route); + return true; + }); + + return xmlParser.simplifyXmlParsed(result); + } +} + +module.exports = Fb2Parser; \ No newline at end of file diff --git a/server/core/xml/XmlParser.js b/server/core/xml/XmlParser.js new file mode 100644 index 0000000..e1c3a0d --- /dev/null +++ b/server/core/xml/XmlParser.js @@ -0,0 +1,342 @@ +//node types +const NODE = 1; +const TEXT = 2; +const CDATA = 3; +const COMMENT = 4; + +const name2type = { + 'NODE': NODE, + 'TEXT': TEXT, + 'CDATA': CDATA, + 'COMMENT': COMMENT, +}; + +const type2name = { + [NODE]: 'NODE', + [TEXT]: 'TEXT', + [CDATA]: 'CDATA', + [COMMENT]: 'COMMENT', +}; + +class NodeBase { + makeSelectorObj(selectorString) { + const result = {all: false, before: false, type: 0, name: ''}; + + if (selectorString === '') { + result.before = true; + } else if (selectorString === '*') { + result.all = true; + } else if (selectorString[0] === '*') { + const typeName = selectorString.substring(1); + result.type = name2type[typeName]; + if (!result.type) + throw new Error(`Unknown selector type: ${typeName}`); + } else { + result.name = selectorString; + } + + return result; + } + + checkNode(rawNode, selectorObj) { + return selectorObj.all || selectorObj.before + || (selectorObj.type && rawNode[0] === selectorObj.type) + || (rawNode[0] === NODE && rawNode[1] === selectorObj.name); + } + + findNodeIndex(nodes, selectorObj) { + for (let i = 0; i < nodes.length; i++) + if (this.checkNode(nodes[i], selectorObj)) + return i; + } + + rawAdd(nodes, rawNode, selectorObj) { + if (selectorObj.all) { + nodes.push(rawNode); + } else if (selectorObj.before) { + nodes.unshift(rawNode); + } else { + const index = this.findNodeIndex(nodes, selectorObj); + if (index >= 0) + nodes.splice(index, 0, rawNode); + else + nodes.push(rawNode); + } + } + + rawRemove(nodes, selectorObj) { + if (selectorObj.before) + return; + + for (let i = nodes.length - 1; i >= 0; i--) { + if (this.checkNode(nodes[i], selectorObj)) + nodes.splice(i, 1); + } + } +} + +class NodeObject extends NodeBase { + constructor(rawNode) { + super(); + + if (rawNode) + this.raw = rawNode; + else + this.raw = []; + } + + get type() { + return this.raw[0] || null; + } + + get name() { + if (this.type === NODE) + return this.raw[1] || null; + + return null; + } + + set name(value) { + if (this.type === NODE) + this.raw[1] = value; + } + + get attrs() { + if (this.type === NODE && Array.isArray(this.raw[2])) + return new Map(this.raw[2]); + + return null; + } + + set attrs(value) { + if (this.type === NODE) + if (value && value.size) + this.raw[2] = Array.from(value); + else + this.raw[2] = null; + } + + get value() { + switch (this.type) { + case NODE: + return this.raw[3] || null; + case TEXT: + case CDATA: + case COMMENT: + return this.raw[1] || null; + } + + return null; + } + + add(node, after = '*') { + if (this.type !== NODE) + return; + + const selectorObj = this.makeSelectorObj(after); + + if (!Array.isArray(this.raw[3])) + this.raw[3] = []; + this.rawAdd(this.raw[3], node.raw, selectorObj); + } + + remove(selector = '') { + if (this.type !== NODE || !this.raw[3]) + return; + + const selectorObj = this.makeSelectorObj(selector); + + this.rawRemove(this.raw[3], selectorObj); + if (!this.raw[3].length) + this.raw[3] = null; + } + + each(callback) { + if (this.type !== NODE || !this.raw[3]) + return; + + for (const n of this.raw[3]) { + callback(new NodeObject(n)); + } + } +} + +class XmlParser extends NodeBase { + constructor(rawNodes = []) { + super(); + + this.NODE = NODE; + this.TEXT = TEXT; + this.CDATA = CDATA; + this.COMMENT = COMMENT; + + this.rawNodes = rawNodes; + } + + get count() { + return this.rawNodes.length; + } + + toObject(node) { + return new NodeObject(node); + } + + newParser(nodes) { + return new XmlParser(nodes); + } + + checkType(type) { + if (!type2name[type]) + throw new Error(`Invalid type: ${type}`); + } + + createTypedNode(type, nameOrValue, attrs = null, value = null) { + this.checkType(type); + switch (type) { + case NODE: + if (!nameOrValue || typeof(nameOrValue) !== 'string') + throw new Error('Node name must be non-empty string'); + return new NodeObject([type, nameOrValue, attrs, value]); + case TEXT: + case CDATA: + case COMMENT: + if (typeof(nameOrValue) !== 'string') + throw new Error('Node value must be of type string'); + return new NodeObject([type, nameOrValue]); + } + } + + createNode(name, attrs = null, value = null) { + return this.createTypedNode(NODE, name, attrs, value); + } + + createText(value = null) { + return this.createTypedNode(TEXT, value); + } + + createCdata(value = null) { + return this.createTypedNode(CDATA, value); + } + + createComment(value = null) { + return this.createTypedNode(COMMENT, value); + } + + add(node, after = '*') { + const selectorObj = this.makeSelectorObj(after); + + for (const n of this.rawNodes) { + if (n && n[0] === NODE) { + if (!Array.isArray(n[3])) + n[3] = []; + this.rawAdd(n[3], node.raw, selectorObj); + } + } + } + + addRoot(node, after = '*') { + const selectorObj = this.makeSelectorObj(after); + + this.rawAdd(this.rawNodes, node.raw, selectorObj); + } + + remove(selector = '') { + const selectorObj = this.makeSelectorObj(selector); + + for (const n of this.rawNodes) { + if (n && n[0] === NODE && Array.isArray(n[3])) { + this.rawRemove(n[3], selectorObj); + if (!n[3].length) + n[3] = null; + } + } + } + + removeRoot(selector = '') { + const selectorObj = this.makeSelectorObj(selector); + + this.rawRemove(this.rawNodes, selectorObj); + } + + each(callback) { + for (const n of this.rawNodes) { + callback(new NodeObject(n)); + } + } + + rawSelect(nodes, selectorObj, callback) { + for (const n of nodes) + if (this.checkNode(n, selectorObj)) + callback(n); + } + + select(selector = '', self = false) { + let newRawNodes = []; + + if (selector.indexOf('/') >= 0) { + const selectors = selector.split('/'); + let res = this; + for (const sel of selectors) { + res = res.select(sel, self); + self = false; + } + + newRawNodes = res.rawNodes; + } else { + const selectorObj = this.makeSelectorObj(selector); + + if (self) { + this.rawSelect(this.rawNodes, selectorObj, (node) => { + newRawNodes.push(node); + }) + } else { + for (const n of this.rawNodes) { + if (n && n[0] === NODE && Array.isArray(n[3])) { + this.rawSelect(n[3], selectorObj, (node) => { + newRawNodes.push(node); + }) + } + } + } + } + + return new XmlParser(newRawNodes); + } + + s(selector, self) { + return this.select(selector, self); + } + + selectFirst(selector, self) { + const result = this.select(selector, self); + const node = (result.count ? result.rawNodes[0] : null); + return this.toObject(node); + } + + sf(selector, self) { + return this.selectFirst(selector, self); + } + + toJson(format = false) { + if (format) + return JSON.stringify(this.rawNodes, null, 2); + else + return JSON.stringify(this.rawNodes); + } + + fromJson(jsonString) { + const parsed = JSON.parse(jsonString); + if (!Array.isArray(parsed)) + throw new Error('JSON parse error: root element must be array'); + + this.rawNodes = parsed; + } + + toString() { + } + + fromSrtring() { + } +} + +module.exports = XmlParser; \ No newline at end of file diff --git a/server/core/xml/sax.js b/server/core/xml/sax.js new file mode 100644 index 0000000..07b5a55 --- /dev/null +++ b/server/core/xml/sax.js @@ -0,0 +1,366 @@ +function parseSync(xstr, options) { + const dummy = () => {}; + let {onStartNode: _onStartNode = dummy, + onEndNode: _onEndNode = dummy, + onTextNode: _onTextNode = dummy, + onCdata: _onCdata = dummy, + onComment: _onComment = dummy, + onProgress: _onProgress = dummy, + innerCut = new Set(), + lowerCase = true, + } = options; + + let i = 0; + const len = xstr.length; + const progStep = len/20; + let nextProg = 0; + + let cutCounter = 0; + let cutTag = ''; + let inCdata; + let inComment; + let leftData = 0; + while (i < len) { + inCdata = false; + inComment = false; + let singleTag = false; + + let left = xstr.indexOf('<', i); + if (left < 0) + break; + leftData = left; + + if (left < len - 2 && xstr[left + 1] == '!') { + if (xstr[left + 2] == '-') { + const leftComment = xstr.indexOf('', leftData + 1); + if (rightData < 0) + break; + right = rightData + 2; + } else { + rightData = xstr.indexOf('>', leftData + 1); + if (rightData < 0) + break; + right = rightData; + if (xstr[right - 1] === '/') { + singleTag = true; + rightData--; + } + } + + let tagData = xstr.substr(leftData + 1, rightData - leftData - 1); + + if (inCdata) { + _onCdata(tagData, cutCounter, cutTag); + } else if (inComment) { + _onComment(tagData, cutCounter, cutTag); + } else { + let tag = ''; + let tail = ''; + const firstSpace = tagData.indexOf(' '); + if (firstSpace >= 0) { + tail = tagData.substr(firstSpace); + tag = tagData.substr(0, firstSpace); + } else { + tag = tagData; + } + if (lowerCase) + tag = tag.toLowerCase(); + + if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) { + if (!cutCounter) + cutTag = tag; + cutCounter++; + } + + let endTag = (singleTag ? tag : ''); + if (tag === '' || tag[0] !== '/') { + _onStartNode(tag, tail, singleTag, cutCounter, cutTag); + } else { + endTag = tag.substr(1); + } + + if (endTag) + _onEndNode(endTag, tail, singleTag, cutCounter, cutTag); + + if (cutTag === endTag) { + cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0); + if (!cutCounter) + cutTag = ''; + } + } + + if (right >= nextProg) { + _onProgress(Math.round(right/(len + 1)*100)); + nextProg += progStep; + } + i = right + 1; + } + + if (i < len) { + if (inCdata) { + _onCdata(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag); + } else if (inComment) { + _onComment(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag); + } else { + _onTextNode(xstr.substr(i, len - i), cutCounter, cutTag); + } + } + + _onProgress(100); +} + +//асинхронная копия parseSync +//делается заменой "_on" => "await _on" после while +async function parse(xstr, options) { + const dummy = () => {}; + let {onStartNode: _onStartNode = dummy, + onEndNode: _onEndNode = dummy, + onTextNode: _onTextNode = dummy, + onCdata: _onCdata = dummy, + onComment: _onComment = dummy, + onProgress: _onProgress = dummy, + innerCut = new Set(), + lowerCase = true, + } = options; + + let i = 0; + const len = xstr.length; + const progStep = len/20; + let nextProg = 0; + + let cutCounter = 0; + let cutTag = ''; + let inCdata; + let inComment; + let leftData = 0; + while (i < len) { + inCdata = false; + inComment = false; + let singleTag = false; + + let left = xstr.indexOf('<', i); + if (left < 0) + break; + leftData = left; + + if (left < len - 2 && xstr[left + 1] == '!') { + if (xstr[left + 2] == '-') { + const leftComment = xstr.indexOf('', leftData + 1); + if (rightData < 0) + break; + right = rightData + 2; + } else { + rightData = xstr.indexOf('>', leftData + 1); + if (rightData < 0) + break; + right = rightData; + if (xstr[right - 1] === '/') { + singleTag = true; + rightData--; + } + } + + let tagData = xstr.substr(leftData + 1, rightData - leftData - 1); + + if (inCdata) { + await _onCdata(tagData, cutCounter, cutTag); + } else if (inComment) { + await _onComment(tagData, cutCounter, cutTag); + } else { + let tag = ''; + let tail = ''; + const firstSpace = tagData.indexOf(' '); + if (firstSpace >= 0) { + tail = tagData.substr(firstSpace); + tag = tagData.substr(0, firstSpace); + } else { + tag = tagData; + } + if (lowerCase) + tag = tag.toLowerCase(); + + if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) { + if (!cutCounter) + cutTag = tag; + cutCounter++; + } + + let endTag = (singleTag ? tag : ''); + if (tag === '' || tag[0] !== '/') { + await _onStartNode(tag, tail, singleTag, cutCounter, cutTag); + } else { + endTag = tag.substr(1); + } + + if (endTag) + await _onEndNode(endTag, tail, singleTag, cutCounter, cutTag); + + if (cutTag === endTag) { + cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0); + if (!cutCounter) + cutTag = ''; + } + } + + if (right >= nextProg) { + await _onProgress(Math.round(right/(len + 1)*100)); + nextProg += progStep; + } + i = right + 1; + } + + if (i < len) { + if (inCdata) { + await _onCdata(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag); + } else if (inComment) { + await _onComment(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag); + } else { + await _onTextNode(xstr.substr(i, len - i), cutCounter, cutTag); + } + } + + await _onProgress(100); +} + +function getAttrsSync(tail, lowerCase = true) { + let result = {}; + let name = ''; + let value = ''; + let vOpen = ''; + let inName = false; + let inValue = false; + let waitValue = false; + let waitEq = false; + + const pushResult = () => { + if (lowerCase) + name = name.toLowerCase(); + if (name != '') { + const fn = name; + let ns = ''; + if (fn.indexOf(':') >= 0) { + [ns, name] = fn.split(':'); + } + + result[name] = {value, ns, fn}; + } + name = ''; + value = ''; + vOpen = ''; + inName = false; + inValue = false; + waitValue = false; + waitEq = false; + }; + + tail = tail.replace(/[\t\n\r]/g, ' '); + for (let i = 0; i < tail.length; i++) { + const c = tail.charAt(i); + if (c == ' ') { + if (inValue) { + if (vOpen == '"') + value += c; + else + pushResult(); + } else if (inName) { + waitEq = true; + inName = false; + } + } else if (!inValue && c == '=') { + waitEq = false; + waitValue = true; + inName = false; + } else if (c == '"') { + if (inValue) { + pushResult(); + } else if (waitValue) { + inValue = true; + vOpen = '"'; + } + } else if (inValue) { + value += c; + } else if (inName) { + name += c; + } else if (waitEq) { + pushResult(); + inName = true; + name = c; + } else if (waitValue) { + waitValue = false; + inValue = true; + vOpen = ' '; + value = c; + } else { + inName = true; + name = c; + } + } + if (name != '') + pushResult(); + + return result; +} + +module.exports = { + parseSync, + getAttrsSync, + parse +} \ No newline at end of file diff --git a/server/core/xml/textUtils.js b/server/core/xml/textUtils.js new file mode 100644 index 0000000..ef05606 --- /dev/null +++ b/server/core/xml/textUtils.js @@ -0,0 +1,130 @@ +const chardet = require('chardet'); + +function getEncoding(buf) { + let selected = getEncodingLite(buf); + + if (selected == 'ISO-8859-5' && buf.length > 10) { + const charsetAll = chardet.analyse(buf.slice(0, 20000)); + for (const charset of charsetAll) { + if (charset.name.indexOf('ISO-8859') < 0) { + selected = charset.name; + break; + } + } + } + + return selected; +} + + +function getEncodingLite(buf, returnAll) { + const lowerCase = 3; + const upperCase = 1; + + const codePage = { + 'k': 'koi8-r', + 'w': 'Windows-1251', + 'd': 'cp866', + 'i': 'ISO-8859-5', + 'm': 'maccyrillic', + 'u': 'utf-8', + }; + + let charsets = { + 'k': 0, + 'w': 0, + 'd': 0, + 'i': 0, + 'm': 0, + 'u': 0, + }; + + const len = buf.length; + const blockSize = (len > 5*3000 ? 3000 : len); + let counter = 0; + let i = 0; + let totalChecked = 0; + while (i < len) { + const char = buf[i]; + const nextChar = (i < len - 1 ? buf[i + 1] : 0); + totalChecked++; + i++; + //non-russian characters + if (char < 128 || char > 256) + continue; + //UTF-8 + if ((char == 208 || char == 209) && nextChar >= 128 && nextChar <= 190) + charsets['u'] += lowerCase; + else { + //CP866 + if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase; + if ((char > 127 && char < 160)) charsets['d'] += upperCase; + + //KOI8-R + if ((char > 191 && char < 223)) charsets['k'] += lowerCase; + if ((char > 222 && char < 256)) charsets['k'] += upperCase; + + //WIN-1251 + if (char > 223 && char < 256) charsets['w'] += lowerCase; + if (char > 191 && char < 224) charsets['w'] += upperCase; + + //MAC + if (char > 221 && char < 255) charsets['m'] += lowerCase; + if (char > 127 && char < 160) charsets['m'] += upperCase; + + //ISO-8859-5 + if (char > 207 && char < 240) charsets['i'] += lowerCase; + if (char > 175 && char < 208) charsets['i'] += upperCase; + } + + counter++; + + if (counter > blockSize) { + counter = 0; + i += Math.round(len/2 - 2*blockSize); + } + } + + let sorted = Object.keys(charsets).map(function(key) { + return { codePage: codePage[key], c: charsets[key], totalChecked }; + }); + + sorted.sort((a, b) => b.c - a.c); + + if (returnAll) + return sorted; + else if (sorted[0].c > 0 && sorted[0].c > sorted[0].totalChecked/2) + return sorted[0].codePage; + else + return 'ISO-8859-5'; +} + +function checkIfText(buf) { + const enc = getEncodingLite(buf, true); + if (enc[0].c > enc[0].totalChecked*0.9) + return true; + + let spaceCount = 0; + let crCount = 0; + let lfCount = 0; + for (let i = 0; i < buf.length; i++) { + if (buf[i] == 32) + spaceCount++; + if (buf[i] == 13) + crCount++; + if (buf[i] == 10) + lfCount++; + } + + const spaceFreq = spaceCount/(buf.length + 1); + const crFreq = crCount/(buf.length + 1); + const lfFreq = lfCount/(buf.length + 1); + + return (buf.length < 1000 || spaceFreq > 0.1 || crFreq > 0.03 || lfFreq > 0.03); +} + +module.exports = { + getEncoding, + getEncodingLite, + checkIfText, +} \ No newline at end of file