Работа над XmlParser
This commit is contained in:
67
server/core/xml/Fb2Parser.js
Normal file
67
server/core/xml/Fb2Parser.js
Normal file
@@ -0,0 +1,67 @@
|
||||
const fs = require('fs-extra');
|
||||
const iconv = require('iconv-lite');
|
||||
const textUtils = require('./textUtils');
|
||||
|
||||
const xmlParser = require('./xmlParser');
|
||||
const utils = require('../utils');
|
||||
|
||||
/**
 * Helpers for reading FB2 (FictionBook) book files.
 */
class Fb2Parser {
    /**
     * Normalizes the text encoding of a raw book buffer.
     *
     * UTF-16 input is converted to UTF-8, surrounding whitespace is removed when
     * the buffer starts with a space, and when the XML declaration names an
     * encoding other than utf-8 the buffer is decoded and the declaration is
     * rewritten to `encoding="utf-8"`.
     *
     * @param {Buffer} data - raw file contents
     * @returns {Buffer} the (possibly re-encoded) contents
     */
    checkEncoding(data) {
        // Fix up UTF-16 input
        let encoding = textUtils.getEncoding(data);
        if (encoding.indexOf('UTF-16') === 0) {
            data = Buffer.from(iconv.decode(data, encoding));
            encoding = 'utf-8';
        }

        // Fix up stray leading spaces, all sorts of files turn up :(
        // The trim is done on bytes: decoding with toString() here would treat the
        // data as UTF-8 and destroy every non-ASCII byte of a single-byte encoding
        // before the real decoding below gets a chance to run.
        if (data[0] === 32) {
            const isSpace = (b) => b === 32 || (b >= 9 && b <= 13);
            let start = 0;
            let end = data.length;
            while (start < end && isSpace(data[start]))
                start++;
            while (end > start && isSpace(data[end - 1]))
                end--;
            data = data.slice(start, end);
        }

        // Final encoding correction, driven by the XML declaration
        let result = data;

        let left = data.indexOf('<?xml version="1.0"');
        if (left < 0) {
            left = data.indexOf('<?xml version=\'1.0\'');
        }

        if (left >= 0) {
            const right = data.indexOf('?>', left);
            if (right >= 0) {
                const head = data.slice(left, right + 2).toString();
                const m = head.match(/encoding=['"](.*?)['"]/);
                if (m) {
                    let enc = m[1].toLowerCase();
                    if (enc !== 'utf-8') {
                        // enc may not match the real encoding of the file, so the
                        // declared value is trusted only when detection fell back
                        // to an ISO-8859 guess
                        if (encoding.indexOf('ISO-8859') >= 0) {
                            encoding = enc;
                        }

                        result = iconv.decode(data, encoding);
                        result = Buffer.from(result.toString().replace(m[0], `encoding="utf-8"`));
                    }
                }
            }
        }

        return result;
    }

    /**
     * Reads a (gzipped) book file and parses it.
     *
     * NOTE(review): work in progress - the encoding check is disabled, every
     * route is logged and accepted, and `parseXml` / `simplifyXmlParsed` must be
     * provided by the required `xmlParser` module; confirm before relying on it.
     *
     * @param {string} bookFile - path to the book file
     * @returns {Promise<*>} whatever `xmlParser.simplifyXmlParsed` returns
     */
    async getDescAndCover(bookFile) {
        let data = await fs.readFile(bookFile);
        data = await utils.gunzipBuffer(data);
        //data = this.checkEncoding(data);

        const result = xmlParser.parseXml(data.toString(), true, (route) => {
            console.log(route);
            return true;
        });

        return xmlParser.simplifyXmlParsed(result);
    }
}
|
||||
|
||||
module.exports = Fb2Parser;
|
||||
342
server/core/xml/XmlParser.js
Normal file
342
server/core/xml/XmlParser.js
Normal file
@@ -0,0 +1,342 @@
|
||||
// Node types (stored as the first element of a raw node array)
const NODE = 1;
const TEXT = 2;
const CDATA = 3;
const COMMENT = 4;

// Selector type name => node type. Frozen: the table is shared by every parser instance.
const name2type = Object.freeze({
    'NODE': NODE,
    'TEXT': TEXT,
    'CDATA': CDATA,
    'COMMENT': COMMENT,
});

// Node type => selector type name (inverse of name2type)
const type2name = Object.freeze({
    [NODE]: 'NODE',
    [TEXT]: 'TEXT',
    [CDATA]: 'CDATA',
    [COMMENT]: 'COMMENT',
});
|
||||
|
||||
/**
 * Shared selector handling and raw node-list manipulation.
 *
 * A raw node is an array whose first element is the node type; for NODE-type
 * entries the second element is the tag name.
 */
class NodeBase {
    /**
     * Parses a selector string into a selector descriptor.
     *
     *   ''       => {before: true}
     *   '*'      => {all: true}
     *   '*NAME'  => {type: <node type for NAME>} (NODE, TEXT, CDATA, COMMENT)
     *   other    => {name: <tag name>}
     *
     * @param {string} selectorString
     * @returns {{all: boolean, before: boolean, type: number, name: string}}
     * @throws {Error} when a '*NAME' selector names an unknown type
     */
    makeSelectorObj(selectorString) {
        const result = {all: false, before: false, type: 0, name: ''};

        if (selectorString === '') {
            result.before = true;
        } else if (selectorString === '*') {
            result.all = true;
        } else if (selectorString[0] === '*') {
            const typeName = selectorString.substring(1);
            // Own-property check: a plain lookup would also find inherited
            // members such as 'toString' or 'constructor' and accept them as types.
            const known = Object.prototype.hasOwnProperty.call(name2type, typeName);
            result.type = (known ? name2type[typeName] : 0);
            if (!result.type)
                throw new Error(`Unknown selector type: ${typeName}`);
        } else {
            result.name = selectorString;
        }

        return result;
    }

    /**
     * Tests a raw node against a selector descriptor.
     * `all` and `before` selectors match every node.
     *
     * @returns a truthy value when the node matches
     */
    checkNode(rawNode, selectorObj) {
        return selectorObj.all || selectorObj.before
            || (selectorObj.type && rawNode[0] === selectorObj.type)
            || (rawNode[0] === NODE && rawNode[1] === selectorObj.name);
    }

    /**
     * Finds the first node matching the selector.
     *
     * @returns {number} index of the match, or -1 when nothing matches
     */
    findNodeIndex(nodes, selectorObj) {
        for (let i = 0; i < nodes.length; i++) {
            if (this.checkNode(nodes[i], selectorObj))
                return i;
        }

        return -1;
    }

    /**
     * Inserts rawNode into nodes (in place): appended for an `all` selector,
     * prepended for a `before` selector, otherwise placed at the index of the
     * first matching node, or appended when nothing matches.
     */
    rawAdd(nodes, rawNode, selectorObj) {
        if (selectorObj.all) {
            nodes.push(rawNode);
        } else if (selectorObj.before) {
            nodes.unshift(rawNode);
        } else {
            const index = this.findNodeIndex(nodes, selectorObj);
            if (index >= 0)
                nodes.splice(index, 0, rawNode);
            else
                nodes.push(rawNode);
        }
    }

    /**
     * Removes every node matching the selector from nodes (in place).
     * A `before` selector ('') removes nothing.
     */
    rawRemove(nodes, selectorObj) {
        if (selectorObj.before)
            return;

        // iterate backwards so that splicing does not shift unvisited items
        for (let i = nodes.length - 1; i >= 0; i--) {
            if (this.checkNode(nodes[i], selectorObj))
                nodes.splice(i, 1);
        }
    }
}
|
||||
|
||||
/**
 * Object wrapper around a single raw node array.
 *
 * Raw layout as used by the accessors below:
 *   NODE:                 [type, name, attrs (array of entries or null), children (array or null)]
 *   TEXT, CDATA, COMMENT: [type, value]
 */
class NodeObject extends NodeBase {
    /**
     * @param {Array} [rawNode] - raw node to wrap; an empty array is used when omitted
     */
    constructor(rawNode) {
        super();

        this.raw = (rawNode ? rawNode : []);
    }

    /** Node type, or null when the raw node is empty. */
    get type() {
        return this.raw[0] || null;
    }

    /** Tag name of a NODE-type node, null for every other type. */
    get name() {
        if (this.type === NODE)
            return this.raw[1] || null;

        return null;
    }

    /** Renames a NODE-type node; ignored for other types. */
    set name(value) {
        if (this.type === NODE)
            this.raw[1] = value;
    }

    /**
     * Attributes of a NODE-type node as a new Map built from the raw entries,
     * or null when there are none. Changing the returned Map does not change
     * the node; assign it back through the setter.
     */
    get attrs() {
        if (this.type === NODE && Array.isArray(this.raw[2]))
            return new Map(this.raw[2]);

        return null;
    }

    /** Stores a Map of attributes as raw entries; an empty or missing value stores null. */
    set attrs(value) {
        if (this.type === NODE) {
            if (value && value.size)
                this.raw[2] = Array.from(value);
            else
                this.raw[2] = null;
        }
    }

    /**
     * Raw children array for a NODE-type node, the text payload for
     * TEXT/CDATA/COMMENT nodes, null when absent.
     */
    get value() {
        switch (this.type) {
            case NODE:
                return this.raw[3] || null;
            case TEXT:
            case CDATA:
            case COMMENT:
                return this.raw[1] || null;
        }

        return null;
    }

    /**
     * Adds a child node; does nothing unless this is a NODE-type node.
     *
     * @param {NodeObject} node - node whose raw array is inserted
     * @param {string} [after='*'] - selector choosing the position (see NodeBase.rawAdd)
     */
    add(node, after = '*') {
        if (this.type !== NODE)
            return;

        const selectorObj = this.makeSelectorObj(after);

        if (!Array.isArray(this.raw[3]))
            this.raw[3] = [];
        this.rawAdd(this.raw[3], node.raw, selectorObj);
    }

    /**
     * Removes the children matching the selector. With the default selector ''
     * nothing is removed. An emptied children list is stored as null.
     */
    remove(selector = '') {
        if (this.type !== NODE || !this.raw[3])
            return;

        const selectorObj = this.makeSelectorObj(selector);

        this.rawRemove(this.raw[3], selectorObj);
        if (!this.raw[3].length)
            this.raw[3] = null;
    }

    /** Calls callback with a NodeObject wrapper for every child. */
    each(callback) {
        if (this.type !== NODE || !this.raw[3])
            return;

        for (const n of this.raw[3]) {
            callback(new NodeObject(n));
        }
    }
}
|
||||
|
||||
/**
 * Container for a list of raw nodes with selection and editing helpers.
 */
class XmlParser extends NodeBase {
    /**
     * @param {Array} [rawNodes=[]] - raw nodes this instance operates on
     */
    constructor(rawNodes = []) {
        super();

        // node type constants, exposed on the instance for callers
        this.NODE = NODE;
        this.TEXT = TEXT;
        this.CDATA = CDATA;
        this.COMMENT = COMMENT;

        this.rawNodes = rawNodes;
    }

    /** Number of raw nodes held by this instance. */
    get count() {
        return this.rawNodes.length;
    }

    /** Wraps a raw node into a NodeObject. */
    toObject(node) {
        return new NodeObject(node);
    }

    /** Creates a new XmlParser over the given raw nodes. */
    newParser(nodes) {
        return new XmlParser(nodes);
    }

    /** @throws {Error} when type is not one of the known node types */
    checkType(type) {
        if (!type2name[type])
            throw new Error(`Invalid type: ${type}`);
    }

    /**
     * Creates a detached node of the given type.
     *
     * @param {number} type - NODE, TEXT, CDATA or COMMENT
     * @param {string} nameOrValue - tag name for NODE, text payload otherwise
     * @param {Map|Array|null} [attrs=null] - NODE attributes: a Map, or raw entries
     * @param {Array|null} [value=null] - raw children of a NODE
     * @returns {NodeObject}
     * @throws {Error} on an invalid type, an empty node name or a non-string value
     */
    createTypedNode(type, nameOrValue, attrs = null, value = null) {
        this.checkType(type);
        switch (type) {
            case NODE: {
                if (!nameOrValue || typeof(nameOrValue) !== 'string')
                    throw new Error('Node name must be non-empty string');

                // Raw nodes keep attributes as an array of entries (what
                // NodeObject.attrs reads and writes); a Map stored as-is would be
                // invisible to that getter and lost by JSON.stringify.
                let rawAttrs = attrs;
                if (attrs instanceof Map)
                    rawAttrs = (attrs.size ? Array.from(attrs) : null);

                return new NodeObject([type, nameOrValue, rawAttrs, value]);
            }
            case TEXT:
            case CDATA:
            case COMMENT:
                if (typeof(nameOrValue) !== 'string')
                    throw new Error('Node value must be of type string');
                return new NodeObject([type, nameOrValue]);
        }
    }

    /** Creates a NODE-type node. */
    createNode(name, attrs = null, value = null) {
        return this.createTypedNode(NODE, name, attrs, value);
    }

    /** Creates a TEXT node; value must be a string. */
    createText(value = null) {
        return this.createTypedNode(TEXT, value);
    }

    /** Creates a CDATA node; value must be a string. */
    createCdata(value = null) {
        return this.createTypedNode(CDATA, value);
    }

    /** Creates a COMMENT node; value must be a string. */
    createComment(value = null) {
        return this.createTypedNode(COMMENT, value);
    }

    /**
     * Adds node as a child of every NODE-type node held by this instance.
     * The same raw array is inserted into each parent.
     */
    add(node, after = '*') {
        const selectorObj = this.makeSelectorObj(after);

        for (const n of this.rawNodes) {
            if (n && n[0] === NODE) {
                if (!Array.isArray(n[3]))
                    n[3] = [];
                this.rawAdd(n[3], node.raw, selectorObj);
            }
        }
    }

    /** Adds node to the list of nodes held by this instance itself. */
    addRoot(node, after = '*') {
        const selectorObj = this.makeSelectorObj(after);

        this.rawAdd(this.rawNodes, node.raw, selectorObj);
    }

    /**
     * Removes matching children from every NODE-type node held by this
     * instance. With the default selector '' nothing is removed.
     */
    remove(selector = '') {
        const selectorObj = this.makeSelectorObj(selector);

        for (const n of this.rawNodes) {
            if (n && n[0] === NODE && Array.isArray(n[3])) {
                this.rawRemove(n[3], selectorObj);
                if (!n[3].length)
                    n[3] = null;
            }
        }
    }

    /** Removes matching nodes from the list held by this instance itself. */
    removeRoot(selector = '') {
        const selectorObj = this.makeSelectorObj(selector);

        this.rawRemove(this.rawNodes, selectorObj);
    }

    /** Calls callback with a NodeObject wrapper for every held node. */
    each(callback) {
        for (const n of this.rawNodes) {
            callback(new NodeObject(n));
        }
    }

    /** Calls callback with every raw node of nodes that matches the selector. */
    rawSelect(nodes, selectorObj, callback) {
        for (const n of nodes) {
            if (this.checkNode(n, selectorObj))
                callback(n);
        }
    }

    /**
     * Selects nodes and returns them in a new XmlParser.
     *
     * A selector containing '/' is applied step by step, each step working on
     * the result of the previous one. '' and '*' match every node.
     *
     * @param {string} [selector='']
     * @param {boolean} [self=false] - match against the held nodes themselves
     *   instead of their children (applies to the first path step only)
     * @returns {XmlParser}
     */
    select(selector = '', self = false) {
        let newRawNodes = [];

        if (selector.indexOf('/') >= 0) {
            const selectors = selector.split('/');
            let res = this;
            for (const sel of selectors) {
                res = res.select(sel, self);
                self = false;
            }

            newRawNodes = res.rawNodes;
        } else {
            const selectorObj = this.makeSelectorObj(selector);
            const collect = (node) => {
                newRawNodes.push(node);
            };

            if (self) {
                this.rawSelect(this.rawNodes, selectorObj, collect);
            } else {
                for (const n of this.rawNodes) {
                    if (n && n[0] === NODE && Array.isArray(n[3])) {
                        this.rawSelect(n[3], selectorObj, collect);
                    }
                }
            }
        }

        return new XmlParser(newRawNodes);
    }

    /** Shorthand for select(). */
    s(selector, self) {
        return this.select(selector, self);
    }

    /**
     * Like select(), but returns only the first match as a NodeObject
     * (an empty NodeObject when nothing matches).
     */
    selectFirst(selector, self) {
        const result = this.select(selector, self);
        const node = (result.count ? result.rawNodes[0] : null);
        return this.toObject(node);
    }

    /** Shorthand for selectFirst(). */
    sf(selector, self) {
        return this.selectFirst(selector, self);
    }

    /**
     * Serializes the held raw nodes to JSON.
     *
     * @param {boolean} [format=false] - pretty-print with two-space indentation
     * @returns {string}
     */
    toJson(format = false) {
        if (format)
            return JSON.stringify(this.rawNodes, null, 2);
        else
            return JSON.stringify(this.rawNodes);
    }

    /**
     * Replaces the held raw nodes with the parsed JSON.
     *
     * @throws {Error} when the JSON root is not an array
     */
    fromJson(jsonString) {
        const parsed = JSON.parse(jsonString);
        if (!Array.isArray(parsed))
            throw new Error('JSON parse error: root element must be array');

        this.rawNodes = parsed;
    }

    // TODO: not implemented yet - currently returns undefined
    toString() {
    }

    // TODO: not implemented yet.
    // NOTE(review): the name looks like a typo of "fromString"; it is kept
    // unchanged because it is part of the public interface.
    fromSrtring() {
    }
}
|
||||
|
||||
module.exports = XmlParser;
|
||||
366
server/core/xml/sax.js
Normal file
366
server/core/xml/sax.js
Normal file
@@ -0,0 +1,366 @@
|
||||
/**
 * Minimal synchronous SAX-style scanner: walks the string once and reports
 * tags, text, CDATA sections and comments through callbacks.
 *
 * Callbacks (all optional):
 *   onStartNode(tag, tail, singleTag, cutCounter, cutTag) - tail is the rest of
 *       the tag after its name (unparsed attributes, see getAttrsSync)
 *   onEndNode(tag, tail, singleTag, cutCounter, cutTag) - also called for
 *       self-closing tags, right after onStartNode
 *   onTextNode(text, cutCounter, cutTag)
 *   onCdata(text, cutCounter, cutTag)
 *   onComment(text, cutCounter, cutTag)
 *   onProgress(percent)
 *
 * Other options:
 *   innerCut  - Set of tag names; cutCounter/cutTag passed to the callbacks
 *               track the nesting depth inside the first such tag encountered
 *   lowerCase - lower-case tag names (default true)
 *
 * Input left over after an unterminated tag, comment or CDATA section is
 * reported as text, comment or CDATA respectively.
 *
 * @param {string} xstr - XML text
 * @param {object} [options={}]
 */
function parseSync(xstr, options = {}) {
    const dummy = () => {};
    const {
        onStartNode: _onStartNode = dummy,
        onEndNode: _onEndNode = dummy,
        onTextNode: _onTextNode = dummy,
        onCdata: _onCdata = dummy,
        onComment: _onComment = dummy,
        onProgress: _onProgress = dummy,
        innerCut = new Set(),
        lowerCase = true,
    } = options;

    let i = 0;
    const len = xstr.length;
    const progStep = len/20;
    let nextProg = 0;

    let cutCounter = 0;
    let cutTag = '';
    let inCdata;
    let inComment;
    let leftData = 0;
    while (i < len) {
        inCdata = false;
        inComment = false;
        let singleTag = false;

        const left = xstr.indexOf('<', i);
        if (left < 0)
            break;
        leftData = left;

        // leftData is moved to the last character of the opening marker,
        // so the payload always starts at leftData + 1
        if (left < len - 2 && xstr[left + 1] === '!') {
            if (xstr.startsWith('<!--', left)) {
                inComment = true;
                leftData = left + 3;
            } else if (xstr.startsWith('<![CDATA[', left)) {
                inCdata = true;
                leftData = left + 8;
            }
        }

        if (left !== i) {
            _onTextNode(xstr.slice(i, left), cutCounter, cutTag);
            // the text is consumed: without this an unterminated tag below
            // would make the tail handler report the same text a second time
            i = left;
        }

        let right = null;
        let rightData = null;
        if (inCdata) {
            rightData = xstr.indexOf(']]>', leftData + 1);
            if (rightData < 0)
                break;
            right = rightData + 2;
        } else if (inComment) {
            rightData = xstr.indexOf('-->', leftData + 1);
            if (rightData < 0)
                break;
            right = rightData + 2;
        } else {
            rightData = xstr.indexOf('>', leftData + 1);
            if (rightData < 0)
                break;
            right = rightData;
            if (xstr[right - 1] === '/') {
                singleTag = true;
                rightData--;
            }
        }

        const tagData = xstr.slice(leftData + 1, rightData);

        if (inCdata) {
            _onCdata(tagData, cutCounter, cutTag);
        } else if (inComment) {
            _onComment(tagData, cutCounter, cutTag);
        } else {
            let tag = '';
            let tail = '';
            // the tag name ends at the first XML whitespace character,
            // which may be a tab or a line break as well as a space
            const firstSpace = tagData.search(/[ \t\n\r]/);
            if (firstSpace >= 0) {
                tail = tagData.slice(firstSpace);
                tag = tagData.slice(0, firstSpace);
            } else {
                tag = tagData;
            }
            if (lowerCase)
                tag = tag.toLowerCase();

            if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) {
                if (!cutCounter)
                    cutTag = tag;
                cutCounter++;
            }

            let endTag = (singleTag ? tag : '');
            if (tag === '' || tag[0] !== '/') {
                _onStartNode(tag, tail, singleTag, cutCounter, cutTag);
            } else {
                endTag = tag.slice(1);
            }

            if (endTag)
                _onEndNode(endTag, tail, singleTag, cutCounter, cutTag);

            if (cutTag === endTag) {
                cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0);
                if (!cutCounter)
                    cutTag = '';
            }
        }

        if (right >= nextProg) {
            _onProgress(Math.round(right/(len + 1)*100));
            nextProg += progStep;
        }
        i = right + 1;
    }

    // whatever is left after the loop stopped early
    if (i < len) {
        if (inCdata) {
            _onCdata(xstr.slice(leftData + 1), cutCounter, cutTag);
        } else if (inComment) {
            _onComment(xstr.slice(leftData + 1), cutCounter, cutTag);
        } else {
            _onTextNode(xstr.slice(i), cutCounter, cutTag);
        }
    }

    _onProgress(100);
}
|
||||
|
||||
/**
 * Asynchronous copy of parseSync: every callback is awaited.
 * It is produced from parseSync by replacing "_on" with "await _on" from the
 * while loop onwards, so keep the two functions in step.
 *
 * Callbacks (all optional, may be async):
 *   onStartNode(tag, tail, singleTag, cutCounter, cutTag)
 *   onEndNode(tag, tail, singleTag, cutCounter, cutTag) - also called for
 *       self-closing tags, right after onStartNode
 *   onTextNode(text, cutCounter, cutTag)
 *   onCdata(text, cutCounter, cutTag)
 *   onComment(text, cutCounter, cutTag)
 *   onProgress(percent)
 *
 * Other options:
 *   innerCut  - Set of tag names; cutCounter/cutTag passed to the callbacks
 *               track the nesting depth inside the first such tag encountered
 *   lowerCase - lower-case tag names (default true)
 *
 * @param {string} xstr - XML text
 * @param {object} [options={}]
 * @returns {Promise<void>}
 */
async function parse(xstr, options = {}) {
    const dummy = () => {};
    const {
        onStartNode: _onStartNode = dummy,
        onEndNode: _onEndNode = dummy,
        onTextNode: _onTextNode = dummy,
        onCdata: _onCdata = dummy,
        onComment: _onComment = dummy,
        onProgress: _onProgress = dummy,
        innerCut = new Set(),
        lowerCase = true,
    } = options;

    let i = 0;
    const len = xstr.length;
    const progStep = len/20;
    let nextProg = 0;

    let cutCounter = 0;
    let cutTag = '';
    let inCdata;
    let inComment;
    let leftData = 0;
    while (i < len) {
        inCdata = false;
        inComment = false;
        let singleTag = false;

        const left = xstr.indexOf('<', i);
        if (left < 0)
            break;
        leftData = left;

        // leftData is moved to the last character of the opening marker,
        // so the payload always starts at leftData + 1
        if (left < len - 2 && xstr[left + 1] === '!') {
            if (xstr.startsWith('<!--', left)) {
                inComment = true;
                leftData = left + 3;
            } else if (xstr.startsWith('<![CDATA[', left)) {
                inCdata = true;
                leftData = left + 8;
            }
        }

        if (left !== i) {
            await _onTextNode(xstr.slice(i, left), cutCounter, cutTag);
            // the text is consumed: without this an unterminated tag below
            // would make the tail handler report the same text a second time
            i = left;
        }

        let right = null;
        let rightData = null;
        if (inCdata) {
            rightData = xstr.indexOf(']]>', leftData + 1);
            if (rightData < 0)
                break;
            right = rightData + 2;
        } else if (inComment) {
            rightData = xstr.indexOf('-->', leftData + 1);
            if (rightData < 0)
                break;
            right = rightData + 2;
        } else {
            rightData = xstr.indexOf('>', leftData + 1);
            if (rightData < 0)
                break;
            right = rightData;
            if (xstr[right - 1] === '/') {
                singleTag = true;
                rightData--;
            }
        }

        const tagData = xstr.slice(leftData + 1, rightData);

        if (inCdata) {
            await _onCdata(tagData, cutCounter, cutTag);
        } else if (inComment) {
            await _onComment(tagData, cutCounter, cutTag);
        } else {
            let tag = '';
            let tail = '';
            // the tag name ends at the first XML whitespace character,
            // which may be a tab or a line break as well as a space
            const firstSpace = tagData.search(/[ \t\n\r]/);
            if (firstSpace >= 0) {
                tail = tagData.slice(firstSpace);
                tag = tagData.slice(0, firstSpace);
            } else {
                tag = tagData;
            }
            if (lowerCase)
                tag = tag.toLowerCase();

            if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) {
                if (!cutCounter)
                    cutTag = tag;
                cutCounter++;
            }

            let endTag = (singleTag ? tag : '');
            if (tag === '' || tag[0] !== '/') {
                await _onStartNode(tag, tail, singleTag, cutCounter, cutTag);
            } else {
                endTag = tag.slice(1);
            }

            if (endTag)
                await _onEndNode(endTag, tail, singleTag, cutCounter, cutTag);

            if (cutTag === endTag) {
                cutCounter = (cutCounter > 0 ? cutCounter - 1 : 0);
                if (!cutCounter)
                    cutTag = '';
            }
        }

        if (right >= nextProg) {
            await _onProgress(Math.round(right/(len + 1)*100));
            nextProg += progStep;
        }
        i = right + 1;
    }

    // whatever is left after the loop stopped early
    if (i < len) {
        if (inCdata) {
            await _onCdata(xstr.slice(leftData + 1), cutCounter, cutTag);
        } else if (inComment) {
            await _onComment(xstr.slice(leftData + 1), cutCounter, cutTag);
        } else {
            await _onTextNode(xstr.slice(i), cutCounter, cutTag);
        }
    }

    await _onProgress(100);
}
|
||||
|
||||
/**
 * Parses the attribute part of a tag (the `tail` reported by parseSync/parse).
 *
 * Values may be double-quoted, single-quoted or unquoted; an attribute without
 * a value gets ''. Tabs and line breaks are treated as spaces. A name of the
 * form `ns:name` is stored under `name`.
 *
 * @param {string} tail - text following the tag name
 * @param {boolean} [lowerCase=true] - lower-case attribute names
 * @returns {Object<string, {value: string, ns: string, fn: string}>}
 *   attributes keyed by local name; `ns` is the prefix ('' if none),
 *   `fn` the full name as written (after optional lower-casing)
 */
function getAttrsSync(tail, lowerCase = true) {
    let result = {};
    let name = '';
    let value = '';
    let vOpen = ''; // what opened the current value: '"', "'" or ' ' for an unquoted one
    let inName = false;
    let inValue = false;
    let waitValue = false;
    let waitEq = false;

    // stores the attribute collected so far and resets the state machine
    const pushResult = () => {
        if (lowerCase)
            name = name.toLowerCase();
        if (name != '') {
            const fn = name;
            let ns = '';
            if (fn.indexOf(':') >= 0) {
                [ns, name] = fn.split(':');
            }

            result[name] = {value, ns, fn};
        }
        name = '';
        value = '';
        vOpen = '';
        inName = false;
        inValue = false;
        waitValue = false;
        waitEq = false;
    };

    const text = tail.replace(/[\t\n\r]/g, ' ');
    for (let i = 0; i < text.length; i++) {
        const c = text.charAt(i);
        if (c == ' ') {
            if (inValue) {
                // a space belongs to a quoted value but terminates an unquoted one
                if (vOpen != ' ')
                    value += c;
                else
                    pushResult();
            } else if (inName) {
                waitEq = true;
                inName = false;
            }
        } else if (!inValue && c == '=') {
            waitEq = false;
            waitValue = true;
            inName = false;
        } else if (c == '"' || (c == '\'' && (inValue || waitValue))) {
            if (inValue) {
                // the value ends at the quote that opened it; a double quote
                // also ends an unquoted value; any other quote is value content
                if (c == vOpen || (c == '"' && vOpen == ' '))
                    pushResult();
                else
                    value += c;
            } else if (waitValue) {
                inValue = true;
                vOpen = c;
            }
        } else if (inValue) {
            value += c;
        } else if (inName) {
            name += c;
        } else if (waitEq) {
            // a new name started before any '=': the previous attribute has no value
            pushResult();
            inName = true;
            name = c;
        } else if (waitValue) {
            waitValue = false;
            inValue = true;
            vOpen = ' ';
            value = c;
        } else {
            inName = true;
            name = c;
        }
    }
    if (name != '')
        pushResult();

    return result;
}
|
||||
|
||||
module.exports = {
|
||||
parseSync,
|
||||
getAttrsSync,
|
||||
parse
|
||||
}
|
||||
130
server/core/xml/textUtils.js
Normal file
130
server/core/xml/textUtils.js
Normal file
@@ -0,0 +1,130 @@
|
||||
const chardet = require('chardet');
|
||||
|
||||
/**
 * Detects the text encoding of a buffer.
 *
 * getEncodingLite() is tried first. When it answers 'ISO-8859-5' (which is also
 * its fallback answer) and the buffer is longer than 10 bytes, the first 20000
 * bytes are analysed with chardet and the first candidate whose name does not
 * contain 'ISO-8859' is taken instead, if there is one.
 *
 * @param {Buffer} buf
 * @returns {string} encoding name
 */
function getEncoding(buf) {
    let selected = getEncodingLite(buf);

    if (selected == 'ISO-8859-5' && buf.length > 10) {
        const charsetAll = chardet.analyse(buf.slice(0, 20000));
        for (const charset of charsetAll) {
            if (charset.name.indexOf('ISO-8859') < 0) {
                selected = charset.name;
                break;
            }
        }
    }

    return selected;
}
|
||||
|
||||
|
||||
/**
 * Lightweight byte-statistics guess between several Cyrillic code pages and UTF-8.
 *
 * Every byte >= 128 adds a score to each code page in whose letter ranges it
 * falls (weight 3 for the ranges scored as `lowerCase`, 1 for `upperCase`);
 * a 208/209 lead byte followed by a byte in 128..190 scores for UTF-8 only.
 * Buffers longer than 15000 bytes are sampled in blocks instead of being
 * scanned in full.
 *
 * @param {Buffer} buf
 * @param {boolean} [returnAll] - return the full ranking instead of one name
 * @returns {string|Array<{codePage: string, c: number, totalChecked: number}>}
 *   with returnAll: all candidates sorted by score, best first; otherwise the
 *   best code page name when its score is positive and above half the number
 *   of bytes examined, else 'ISO-8859-5'
 */
function getEncodingLite(buf, returnAll) {
    const lowerCase = 3;
    const upperCase = 1;

    const codePage = {
        'k': 'koi8-r',
        'w': 'Windows-1251',
        'd': 'cp866',
        'i': 'ISO-8859-5',
        'm': 'maccyrillic',
        'u': 'utf-8',
    };

    const charsets = {
        'k': 0,
        'w': 0,
        'd': 0,
        'i': 0,
        'm': 0,
        'u': 0,
    };

    const len = buf.length;
    const blockSize = (len > 5*3000 ? 3000 : len);
    let counter = 0;
    let i = 0;
    let totalChecked = 0;
    while (i < len) {
        const char = buf[i];
        const nextChar = (i < len - 1 ? buf[i + 1] : 0);
        totalChecked++;
        i++;
        // ASCII bytes carry no information about the code page
        if (char < 128)
            continue;

        if ((char == 208 || char == 209) && nextChar >= 128 && nextChar <= 190) {
            //UTF-8
            charsets['u'] += lowerCase;
        } else {
            //CP866
            if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
            if ((char > 127 && char < 160)) charsets['d'] += upperCase;

            //KOI8-R
            if ((char > 191 && char < 223)) charsets['k'] += lowerCase;
            if ((char > 222 && char < 256)) charsets['k'] += upperCase;

            //WIN-1251
            if (char > 223 && char < 256) charsets['w'] += lowerCase;
            if (char > 191 && char < 224) charsets['w'] += upperCase;

            //MAC
            if (char > 221 && char < 255) charsets['m'] += lowerCase;
            if (char > 127 && char < 160) charsets['m'] += upperCase;

            //ISO-8859-5
            if (char > 207 && char < 240) charsets['i'] += lowerCase;
            if (char > 175 && char < 208) charsets['i'] += upperCase;
        }

        // counts scored (non-ASCII) bytes only; after a full block, jump ahead
        counter++;

        if (counter > blockSize) {
            counter = 0;
            i += Math.round(len/2 - 2*blockSize);
        }
    }

    const sorted = Object.keys(charsets).map((key) => {
        return { codePage: codePage[key], c: charsets[key], totalChecked };
    });

    sorted.sort((a, b) => b.c - a.c);

    if (returnAll)
        return sorted;
    else if (sorted[0].c > 0 && sorted[0].c > sorted[0].totalChecked/2)
        return sorted[0].codePage;
    else
        return 'ISO-8859-5';
}
|
||||
|
||||
/**
 * Heuristic check whether a buffer looks like text.
 *
 * True when the best getEncodingLite() score exceeds 90% of the bytes it
 * examined. Otherwise true when the buffer is shorter than 1000 bytes, or when
 * spaces make up more than 10% of it, or CR or LF bytes more than 3% each.
 *
 * @param {Buffer} buf
 * @returns {boolean}
 */
function checkIfText(buf) {
    const enc = getEncodingLite(buf, true);
    if (enc[0].c > enc[0].totalChecked*0.9)
        return true;

    let spaceCount = 0;
    let crCount = 0;
    let lfCount = 0;
    for (const byte of buf) {
        if (byte == 32)
            spaceCount++;
        if (byte == 13)
            crCount++;
        if (byte == 10)
            lfCount++;
    }

    // + 1 keeps the division defined for an empty buffer
    const total = buf.length + 1;
    const spaceFreq = spaceCount/total;
    const crFreq = crCount/total;
    const lfFreq = lfCount/total;

    return (buf.length < 1000 || spaceFreq > 0.1 || crFreq > 0.03 || lfFreq > 0.03);
}
|
||||
|
||||
module.exports = {
|
||||
getEncoding,
|
||||
getEncodingLite,
|
||||
checkIfText,
|
||||
}
|
||||
Reference in New Issue
Block a user