Работа над XmlParser

This commit is contained in:
Book Pauk
2022-11-07 19:52:29 +07:00
parent 02f276ca6b
commit a40d9e25b0
4 changed files with 905 additions and 0 deletions

View File

@@ -0,0 +1,67 @@
const fs = require('fs-extra');
const iconv = require('iconv-lite');
const textUtils = require('./textUtils');
const xmlParser = require('./xmlParser');
const utils = require('../utils');
class Fb2Parser {
checkEncoding(data) {
//Корректируем кодировку UTF-16
let encoding = textUtils.getEncoding(data);
if (encoding.indexOf('UTF-16') == 0) {
data = Buffer.from(iconv.decode(data, encoding));
encoding = 'utf-8';
}
//Корректируем пробелы, всякие файлы попадаются :(
if (data[0] == 32) {
data = Buffer.from(data.toString().trim());
}
//Окончательно корректируем кодировку
let result = data;
let left = data.indexOf('<?xml version="1.0"');
if (left < 0) {
left = data.indexOf('<?xml version=\'1.0\'');
}
if (left >= 0) {
const right = data.indexOf('?>', left);
if (right >= 0) {
const head = data.slice(left, right + 2).toString();
const m = head.match(/encoding=['"](.*?)['"]/);
if (m) {
let enc = m[1].toLowerCase();
if (enc != 'utf-8') {
//enc может не соответсвовать реальной кодировке файла, поэтому:
if (encoding.indexOf('ISO-8859') >= 0) {
encoding = enc;
}
result = iconv.decode(data, encoding);
result = Buffer.from(result.toString().replace(m[0], `encoding="utf-8"`));
}
}
}
}
return result;
}
async getDescAndCover(bookFile) {
let data = await fs.readFile(bookFile);
data = await utils.gunzipBuffer(data);
//data = this.checkEncoding(data);
const result = xmlParser.parseXml(data.toString(), true, (route) => {
console.log(route);
return true;
});
return xmlParser.simplifyXmlParsed(result);
}
}
module.exports = Fb2Parser;

View File

@@ -0,0 +1,342 @@
//node types
const NODE = 1;
const TEXT = 2;
const CDATA = 3;
const COMMENT = 4;
const name2type = {
'NODE': NODE,
'TEXT': TEXT,
'CDATA': CDATA,
'COMMENT': COMMENT,
};
const type2name = {
[NODE]: 'NODE',
[TEXT]: 'TEXT',
[CDATA]: 'CDATA',
[COMMENT]: 'COMMENT',
};
class NodeBase {
makeSelectorObj(selectorString) {
const result = {all: false, before: false, type: 0, name: ''};
if (selectorString === '') {
result.before = true;
} else if (selectorString === '*') {
result.all = true;
} else if (selectorString[0] === '*') {
const typeName = selectorString.substring(1);
result.type = name2type[typeName];
if (!result.type)
throw new Error(`Unknown selector type: ${typeName}`);
} else {
result.name = selectorString;
}
return result;
}
checkNode(rawNode, selectorObj) {
return selectorObj.all || selectorObj.before
|| (selectorObj.type && rawNode[0] === selectorObj.type)
|| (rawNode[0] === NODE && rawNode[1] === selectorObj.name);
}
findNodeIndex(nodes, selectorObj) {
for (let i = 0; i < nodes.length; i++)
if (this.checkNode(nodes[i], selectorObj))
return i;
}
rawAdd(nodes, rawNode, selectorObj) {
if (selectorObj.all) {
nodes.push(rawNode);
} else if (selectorObj.before) {
nodes.unshift(rawNode);
} else {
const index = this.findNodeIndex(nodes, selectorObj);
if (index >= 0)
nodes.splice(index, 0, rawNode);
else
nodes.push(rawNode);
}
}
rawRemove(nodes, selectorObj) {
if (selectorObj.before)
return;
for (let i = nodes.length - 1; i >= 0; i--) {
if (this.checkNode(nodes[i], selectorObj))
nodes.splice(i, 1);
}
}
}
class NodeObject extends NodeBase {
constructor(rawNode) {
super();
if (rawNode)
this.raw = rawNode;
else
this.raw = [];
}
get type() {
return this.raw[0] || null;
}
get name() {
if (this.type === NODE)
return this.raw[1] || null;
return null;
}
set name(value) {
if (this.type === NODE)
this.raw[1] = value;
}
get attrs() {
if (this.type === NODE && Array.isArray(this.raw[2]))
return new Map(this.raw[2]);
return null;
}
set attrs(value) {
if (this.type === NODE)
if (value && value.size)
this.raw[2] = Array.from(value);
else
this.raw[2] = null;
}
get value() {
switch (this.type) {
case NODE:
return this.raw[3] || null;
case TEXT:
case CDATA:
case COMMENT:
return this.raw[1] || null;
}
return null;
}
add(node, after = '*') {
if (this.type !== NODE)
return;
const selectorObj = this.makeSelectorObj(after);
if (!Array.isArray(this.raw[3]))
this.raw[3] = [];
this.rawAdd(this.raw[3], node.raw, selectorObj);
}
remove(selector = '') {
if (this.type !== NODE || !this.raw[3])
return;
const selectorObj = this.makeSelectorObj(selector);
this.rawRemove(this.raw[3], selectorObj);
if (!this.raw[3].length)
this.raw[3] = null;
}
each(callback) {
if (this.type !== NODE || !this.raw[3])
return;
for (const n of this.raw[3]) {
callback(new NodeObject(n));
}
}
}
class XmlParser extends NodeBase {
constructor(rawNodes = []) {
super();
this.NODE = NODE;
this.TEXT = TEXT;
this.CDATA = CDATA;
this.COMMENT = COMMENT;
this.rawNodes = rawNodes;
}
get count() {
return this.rawNodes.length;
}
toObject(node) {
return new NodeObject(node);
}
newParser(nodes) {
return new XmlParser(nodes);
}
checkType(type) {
if (!type2name[type])
throw new Error(`Invalid type: ${type}`);
}
createTypedNode(type, nameOrValue, attrs = null, value = null) {
this.checkType(type);
switch (type) {
case NODE:
if (!nameOrValue || typeof(nameOrValue) !== 'string')
throw new Error('Node name must be non-empty string');
return new NodeObject([type, nameOrValue, attrs, value]);
case TEXT:
case CDATA:
case COMMENT:
if (typeof(nameOrValue) !== 'string')
throw new Error('Node value must be of type string');
return new NodeObject([type, nameOrValue]);
}
}
createNode(name, attrs = null, value = null) {
return this.createTypedNode(NODE, name, attrs, value);
}
createText(value = null) {
return this.createTypedNode(TEXT, value);
}
createCdata(value = null) {
return this.createTypedNode(CDATA, value);
}
createComment(value = null) {
return this.createTypedNode(COMMENT, value);
}
add(node, after = '*') {
const selectorObj = this.makeSelectorObj(after);
for (const n of this.rawNodes) {
if (n && n[0] === NODE) {
if (!Array.isArray(n[3]))
n[3] = [];
this.rawAdd(n[3], node.raw, selectorObj);
}
}
}
addRoot(node, after = '*') {
const selectorObj = this.makeSelectorObj(after);
this.rawAdd(this.rawNodes, node.raw, selectorObj);
}
remove(selector = '') {
const selectorObj = this.makeSelectorObj(selector);
for (const n of this.rawNodes) {
if (n && n[0] === NODE && Array.isArray(n[3])) {
this.rawRemove(n[3], selectorObj);
if (!n[3].length)
n[3] = null;
}
}
}
removeRoot(selector = '') {
const selectorObj = this.makeSelectorObj(selector);
this.rawRemove(this.rawNodes, selectorObj);
}
each(callback) {
for (const n of this.rawNodes) {
callback(new NodeObject(n));
}
}
rawSelect(nodes, selectorObj, callback) {
for (const n of nodes)
if (this.checkNode(n, selectorObj))
callback(n);
}
select(selector = '', self = false) {
let newRawNodes = [];
if (selector.indexOf('/') >= 0) {
const selectors = selector.split('/');
let res = this;
for (const sel of selectors) {
res = res.select(sel, self);
self = false;
}
newRawNodes = res.rawNodes;
} else {
const selectorObj = this.makeSelectorObj(selector);
if (self) {
this.rawSelect(this.rawNodes, selectorObj, (node) => {
newRawNodes.push(node);
})
} else {
for (const n of this.rawNodes) {
if (n && n[0] === NODE && Array.isArray(n[3])) {
this.rawSelect(n[3], selectorObj, (node) => {
newRawNodes.push(node);
})
}
}
}
}
return new XmlParser(newRawNodes);
}
s(selector, self) {
return this.select(selector, self);
}
selectFirst(selector, self) {
const result = this.select(selector, self);
const node = (result.count ? result.rawNodes[0] : null);
return this.toObject(node);
}
sf(selector, self) {
return this.selectFirst(selector, self);
}
toJson(format = false) {
if (format)
return JSON.stringify(this.rawNodes, null, 2);
else
return JSON.stringify(this.rawNodes);
}
fromJson(jsonString) {
const parsed = JSON.parse(jsonString);
if (!Array.isArray(parsed))
throw new Error('JSON parse error: root element must be array');
this.rawNodes = parsed;
}
toString() {
}
fromSrtring() {
}
}
module.exports = XmlParser;

366
server/core/xml/sax.js Normal file
View File

@@ -0,0 +1,366 @@
function parseSync(xstr, options) {
const dummy = () => {};
let {onStartNode: _onStartNode = dummy,
onEndNode: _onEndNode = dummy,
onTextNode: _onTextNode = dummy,
onCdata: _onCdata = dummy,
onComment: _onComment = dummy,
onProgress: _onProgress = dummy,
innerCut = new Set(),
lowerCase = true,
} = options;
let i = 0;
const len = xstr.length;
const progStep = len/20;
let nextProg = 0;
let cutCounter = 0;
let cutTag = '';
let inCdata;
let inComment;
let leftData = 0;
while (i < len) {
inCdata = false;
inComment = false;
let singleTag = false;
let left = xstr.indexOf('<', i);
if (left < 0)
break;
leftData = left;
if (left < len - 2 && xstr[left + 1] == '!') {
if (xstr[left + 2] == '-') {
const leftComment = xstr.indexOf('<!--', left);
if (leftComment == left) {
inComment = true;
leftData = left + 3;
}
}
if (!inComment && xstr[left + 2] == '[') {
const leftCdata = xstr.indexOf('<![CDATA[', left);
if (leftCdata == left) {
inCdata = true;
leftData = left + 8;
}
}
}
if (left != i) {
const text = xstr.substr(i, left - i);
_onTextNode(text, cutCounter, cutTag);
}
let right = null;
let rightData = null;
if (inCdata) {
rightData = xstr.indexOf(']]>', leftData + 1);
if (rightData < 0)
break;
right = rightData + 2;
} else if (inComment) {
rightData = xstr.indexOf('-->', leftData + 1);
if (rightData < 0)
break;
right = rightData + 2;
} else {
rightData = xstr.indexOf('>', leftData + 1);
if (rightData < 0)
break;
right = rightData;
if (xstr[right - 1] === '/') {
singleTag = true;
rightData--;
}
}
let tagData = xstr.substr(leftData + 1, rightData - leftData - 1);
if (inCdata) {
_onCdata(tagData, cutCounter, cutTag);
} else if (inComment) {
_onComment(tagData, cutCounter, cutTag);
} else {
let tag = '';
let tail = '';
const firstSpace = tagData.indexOf(' ');
if (firstSpace >= 0) {
tail = tagData.substr(firstSpace);
tag = tagData.substr(0, firstSpace);
} else {
tag = tagData;
}
if (lowerCase)
tag = tag.toLowerCase();
if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) {
if (!cutCounter)
cutTag = tag;
cutCounter++;
}
let endTag = (singleTag ? tag : '');
if (tag === '' || tag[0] !== '/') {
_onStartNode(tag, tail, singleTag, cutCounter, cutTag);
} else {
endTag = tag.substr(1);
}
if (endTag)
_onEndNode(endTag, tail, singleTag, cutCounter, cutTag);
if (cutTag === endTag) {
cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0);
if (!cutCounter)
cutTag = '';
}
}
if (right >= nextProg) {
_onProgress(Math.round(right/(len + 1)*100));
nextProg += progStep;
}
i = right + 1;
}
if (i < len) {
if (inCdata) {
_onCdata(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag);
} else if (inComment) {
_onComment(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag);
} else {
_onTextNode(xstr.substr(i, len - i), cutCounter, cutTag);
}
}
_onProgress(100);
}
//асинхронная копия parseSync
//делается заменой "_on" => "await _on" после while
async function parse(xstr, options) {
const dummy = () => {};
let {onStartNode: _onStartNode = dummy,
onEndNode: _onEndNode = dummy,
onTextNode: _onTextNode = dummy,
onCdata: _onCdata = dummy,
onComment: _onComment = dummy,
onProgress: _onProgress = dummy,
innerCut = new Set(),
lowerCase = true,
} = options;
let i = 0;
const len = xstr.length;
const progStep = len/20;
let nextProg = 0;
let cutCounter = 0;
let cutTag = '';
let inCdata;
let inComment;
let leftData = 0;
while (i < len) {
inCdata = false;
inComment = false;
let singleTag = false;
let left = xstr.indexOf('<', i);
if (left < 0)
break;
leftData = left;
if (left < len - 2 && xstr[left + 1] == '!') {
if (xstr[left + 2] == '-') {
const leftComment = xstr.indexOf('<!--', left);
if (leftComment == left) {
inComment = true;
leftData = left + 3;
}
}
if (!inComment && xstr[left + 2] == '[') {
const leftCdata = xstr.indexOf('<![CDATA[', left);
if (leftCdata == left) {
inCdata = true;
leftData = left + 8;
}
}
}
if (left != i) {
const text = xstr.substr(i, left - i);
await _onTextNode(text, cutCounter, cutTag);
}
let right = null;
let rightData = null;
if (inCdata) {
rightData = xstr.indexOf(']]>', leftData + 1);
if (rightData < 0)
break;
right = rightData + 2;
} else if (inComment) {
rightData = xstr.indexOf('-->', leftData + 1);
if (rightData < 0)
break;
right = rightData + 2;
} else {
rightData = xstr.indexOf('>', leftData + 1);
if (rightData < 0)
break;
right = rightData;
if (xstr[right - 1] === '/') {
singleTag = true;
rightData--;
}
}
let tagData = xstr.substr(leftData + 1, rightData - leftData - 1);
if (inCdata) {
await _onCdata(tagData, cutCounter, cutTag);
} else if (inComment) {
await _onComment(tagData, cutCounter, cutTag);
} else {
let tag = '';
let tail = '';
const firstSpace = tagData.indexOf(' ');
if (firstSpace >= 0) {
tail = tagData.substr(firstSpace);
tag = tagData.substr(0, firstSpace);
} else {
tag = tagData;
}
if (lowerCase)
tag = tag.toLowerCase();
if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) {
if (!cutCounter)
cutTag = tag;
cutCounter++;
}
let endTag = (singleTag ? tag : '');
if (tag === '' || tag[0] !== '/') {
await _onStartNode(tag, tail, singleTag, cutCounter, cutTag);
} else {
endTag = tag.substr(1);
}
if (endTag)
await _onEndNode(endTag, tail, singleTag, cutCounter, cutTag);
if (cutTag === endTag) {
cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0);
if (!cutCounter)
cutTag = '';
}
}
if (right >= nextProg) {
await _onProgress(Math.round(right/(len + 1)*100));
nextProg += progStep;
}
i = right + 1;
}
if (i < len) {
if (inCdata) {
await _onCdata(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag);
} else if (inComment) {
await _onComment(xstr.substr(leftData + 1, len - leftData - 1), cutCounter, cutTag);
} else {
await _onTextNode(xstr.substr(i, len - i), cutCounter, cutTag);
}
}
await _onProgress(100);
}
function getAttrsSync(tail, lowerCase = true) {
let result = {};
let name = '';
let value = '';
let vOpen = '';
let inName = false;
let inValue = false;
let waitValue = false;
let waitEq = false;
const pushResult = () => {
if (lowerCase)
name = name.toLowerCase();
if (name != '') {
const fn = name;
let ns = '';
if (fn.indexOf(':') >= 0) {
[ns, name] = fn.split(':');
}
result[name] = {value, ns, fn};
}
name = '';
value = '';
vOpen = '';
inName = false;
inValue = false;
waitValue = false;
waitEq = false;
};
tail = tail.replace(/[\t\n\r]/g, ' ');
for (let i = 0; i < tail.length; i++) {
const c = tail.charAt(i);
if (c == ' ') {
if (inValue) {
if (vOpen == '"')
value += c;
else
pushResult();
} else if (inName) {
waitEq = true;
inName = false;
}
} else if (!inValue && c == '=') {
waitEq = false;
waitValue = true;
inName = false;
} else if (c == '"') {
if (inValue) {
pushResult();
} else if (waitValue) {
inValue = true;
vOpen = '"';
}
} else if (inValue) {
value += c;
} else if (inName) {
name += c;
} else if (waitEq) {
pushResult();
inName = true;
name = c;
} else if (waitValue) {
waitValue = false;
inValue = true;
vOpen = ' ';
value = c;
} else {
inName = true;
name = c;
}
}
if (name != '')
pushResult();
return result;
}
module.exports = {
parseSync,
getAttrsSync,
parse
}

View File

@@ -0,0 +1,130 @@
const chardet = require('chardet');
function getEncoding(buf) {
let selected = getEncodingLite(buf);
if (selected == 'ISO-8859-5' && buf.length > 10) {
const charsetAll = chardet.analyse(buf.slice(0, 20000));
for (const charset of charsetAll) {
if (charset.name.indexOf('ISO-8859') < 0) {
selected = charset.name;
break;
}
}
}
return selected;
}
function getEncodingLite(buf, returnAll) {
const lowerCase = 3;
const upperCase = 1;
const codePage = {
'k': 'koi8-r',
'w': 'Windows-1251',
'd': 'cp866',
'i': 'ISO-8859-5',
'm': 'maccyrillic',
'u': 'utf-8',
};
let charsets = {
'k': 0,
'w': 0,
'd': 0,
'i': 0,
'm': 0,
'u': 0,
};
const len = buf.length;
const blockSize = (len > 5*3000 ? 3000 : len);
let counter = 0;
let i = 0;
let totalChecked = 0;
while (i < len) {
const char = buf[i];
const nextChar = (i < len - 1 ? buf[i + 1] : 0);
totalChecked++;
i++;
//non-russian characters
if (char < 128 || char > 256)
continue;
//UTF-8
if ((char == 208 || char == 209) && nextChar >= 128 && nextChar <= 190)
charsets['u'] += lowerCase;
else {
//CP866
if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
if ((char > 127 && char < 160)) charsets['d'] += upperCase;
//KOI8-R
if ((char > 191 && char < 223)) charsets['k'] += lowerCase;
if ((char > 222 && char < 256)) charsets['k'] += upperCase;
//WIN-1251
if (char > 223 && char < 256) charsets['w'] += lowerCase;
if (char > 191 && char < 224) charsets['w'] += upperCase;
//MAC
if (char > 221 && char < 255) charsets['m'] += lowerCase;
if (char > 127 && char < 160) charsets['m'] += upperCase;
//ISO-8859-5
if (char > 207 && char < 240) charsets['i'] += lowerCase;
if (char > 175 && char < 208) charsets['i'] += upperCase;
}
counter++;
if (counter > blockSize) {
counter = 0;
i += Math.round(len/2 - 2*blockSize);
}
}
let sorted = Object.keys(charsets).map(function(key) {
return { codePage: codePage[key], c: charsets[key], totalChecked };
});
sorted.sort((a, b) => b.c - a.c);
if (returnAll)
return sorted;
else if (sorted[0].c > 0 && sorted[0].c > sorted[0].totalChecked/2)
return sorted[0].codePage;
else
return 'ISO-8859-5';
}
function checkIfText(buf) {
const enc = getEncodingLite(buf, true);
if (enc[0].c > enc[0].totalChecked*0.9)
return true;
let spaceCount = 0;
let crCount = 0;
let lfCount = 0;
for (let i = 0; i < buf.length; i++) {
if (buf[i] == 32)
spaceCount++;
if (buf[i] == 13)
crCount++;
if (buf[i] == 10)
lfCount++;
}
const spaceFreq = spaceCount/(buf.length + 1);
const crFreq = crCount/(buf.length + 1);
const lfFreq = lfCount/(buf.length + 1);
return (buf.length < 1000 || spaceFreq > 0.1 || crFreq > 0.03 || lfFreq > 0.03);
}
module.exports = {
getEncoding,
getEncodingLite,
checkIfText,
}