Files
liberama/server/core/BookConverter/index.js
2019-01-27 01:14:50 +07:00

382 lines
12 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
const fs = require('fs-extra');
const URL = require('url').URL;
const iconv = require('iconv-lite');
const chardet = require('chardet');
const _ = require('lodash');
const sax = require('./sax');
const FileDetector = require('../FileDetector');
/**
 * Converts downloaded book files (HTML/XML pages or ready-made FB2) into
 * FictionBook 2.0 XML.
 *
 * FB2 documents are assembled as a light tree of plain values and serialized
 * by formatFb2Node():
 *   _n - element name;
 *   _t - text content of the element;
 *   _a - ordered list of child nodes;
 *   any other key starting with "_" is internal and is never serialized;
 *   every remaining key is a child element named after the key.
 */
class BookConverter {
    constructor() {
        // Used by convertToFb2() to find out what kind of file was downloaded.
        this.detector = new FileDetector();
    }

    /**
     * Converts `inputFile` to FB2 and writes the result to `outputFile`.
     *
     * @param {string} inputFile - path of the downloaded file
     * @param {string} outputFile - path the FB2 document is written to
     * @param {string} url - address the file was downloaded from; its hostname
     *     selects a site-specific converter
     * @param {function(number)} [callback] - progress callback, called with 100
     *     once the output file has been written
     * @returns {Promise<void>}
     * @throws {Error} when the file type is not detected or is not html/xml
     * @throws {TypeError} when `url` is not a valid absolute URL (only parsed
     *     for files that are not already FB2)
     */
    async convertToFb2(inputFile, outputFile, url, callback) {
        const fileType = await this.detector.detectFile(inputFile);

        if (!fileType)
            throw new Error(`unsupported file format: ${url}`);
        if (fileType.ext !== 'html' && fileType.ext !== 'xml')
            throw new Error(`unknown file format: ${fileType.mime}`);

        const data = await fs.readFile(inputFile);

        let result;
        if (data.toString().indexOf('<FictionBook') >= 0) {
            // Already an FB2 document: copy it through untouched.
            result = data;
        } else {
            const {hostname} = new URL(url);
            if (hostname === 'samlib.ru' || hostname === 'budclub.ru') {
                result = await this.convertSamlib(data);
            } else {
                // Stub: generic best-effort HTML conversion.
                result = await this.convertHtml(data);
            }
        }

        await fs.writeFile(outputFile, result);

        // Report completion on every successful path, not only the generic one.
        if (typeof callback === 'function')
            callback(100);
    }

    /**
     * Decodes a raw buffer into a string using a detected character set.
     *
     * The first detection candidate that is not an ISO-8859 variant and that
     * iconv-lite is able to decode wins; otherwise ISO-8859-1 is used.
     *
     * @param {Buffer} data - raw file contents
     * @returns {string} decoded text
     */
    decode(data) {
        // Detection only looks at the first 10000 bytes.
        const candidates = chardet.detectAll(data.slice(0, 10000)) || [];

        let selected = 'ISO-8859-1';
        for (const charset of candidates) {
            const usable = charset.name.indexOf('ISO-8859') < 0
                && iconv.encodingExists(charset.name);// skip names iconv would throw on
            if (usable) {
                selected = charset.name;
                break;
            }
        }

        return iconv.decode(data, selected);
    }

    /**
     * Counts the leading blank chunks of a line when it is split on single
     * spaces; used as the indentation measure of a text line.
     *
     * @param {string} line
     * @returns {number} number of leading whitespace-only chunks
     *     (an empty line yields 1)
     */
    countIndent(line) {
        const chunks = line.split(' ');
        let count = 0;
        while (count < chunks.length && chunks[count].trim() === '') {
            count++;
        }
        return count;
    }

    /**
     * Generic HTML (or plain text) to FB2 conversion.
     *
     * Paragraphs are started on block-level tags. When the input looks like
     * plain text (few paragraphs for its size, or `isText` is set), paragraphs
     * are re-split by the most indented line prefix that is still common.
     *
     * @param {Buffer} data - raw page contents
     * @param {boolean} [isText] - force plain-text paragraph detection
     * @returns {string} FB2 document
     */
    convertHtml(data, isText) {
        const titleInfo = {};
        const desc = {_n: 'description', 'title-info': titleInfo};
        const pars = [];
        const body = {_n: 'body', section: {_a: []}};
        const fb2 = [desc, body];

        let title = '';
        let inTitle = false;
        // spaceCounter[k] = number of text lines seen with indentation k
        const spaceCounter = [];

        const newParagraph = () => {
            pars.push({_n: 'p', _t: ''});
        };

        const growParagraph = (text) => {
            const last = pars[pars.length - 1];
            if (last) {
                if (last._t === '')
                    text = text.trimLeft();
                last._t += text;
            }

            // Count line indents so that paragraphs can be detected later.
            for (const line of text.split('\n')) {
                const indent = this.countIndent(line);
                spaceCounter[indent] = (spaceCounter[indent] || 0) + 1;
            }
        };

        newParagraph();

        // Tags that start a new paragraph; closing tags carry a leading "/".
        const newPara = new Set(['tr', 'br', 'br/', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);

        const onText = (text, cutCounter) => {
            if (!cutCounter) {
                growParagraph(text);
            }
            // The title is taken even when it sits inside a cut-out tag.
            if (inTitle && !title)
                title = text;
        };

        const onNode = (tag, cutCounter) => {
            if (!cutCounter && newPara.has(tag))
                newParagraph();

            if (tag === 'title')
                inTitle = true;
            else if (tag === '/title')
                inTitle = false;
        };

        const buf = this.decode(data);

        // The class has no parseHtml() method, so parsing goes through
        // sax.parse(). Callback argument order mirrors the handlers in
        // convertSamlib(). NOTE(review): sax is not visible from this file;
        // confirm the callback contract against ./sax.
        sax.parse(buf, {
            onStartNode: (elemName, tail, left, cutCounter) => onNode(elemName, cutCounter),
            onEndNode: (elemName, tail, left, cutCounter) => onNode(`/${elemName}`, cutCounter),
            onTextNode: (text, left, cutCounter) => onText(text, cutCounter),
            onComment: () => {},
            innerCut: new Set(['head', 'script', 'style']),
        });

        titleInfo['book-title'] = title;

        let result = pars;

        // Looks like plain text: it has to be split into paragraphs.
        if ((isText || pars.length < buf.length/2000) && spaceCounter.length) {
            let total = 0;
            for (let i = 0; i < spaceCounter.length; i++) {
                total += (spaceCounter[i] || 0);
            }
            // An indent counts when at least 10% of all lines use it.
            const threshold = total/10;

            let parIndent = spaceCounter.length - 1;
            while (parIndent > 0 && (!spaceCounter[parIndent] || spaceCounter[parIndent] < threshold)) {
                parIndent--;
            }

            if (parIndent > 0) {// paragraph indent found
                const newPars = [];
                for (const par of pars) {
                    newPars.push({_n: 'p', _t: ''});
                    for (const line of par._t.split('\n')) {
                        if (this.countIndent(line) >= parIndent)
                            newPars.push({_n: 'p', _t: ''});
                        newPars[newPars.length - 1]._t += line.trim() + ' ';
                    }
                }
                result = newPars;
            }
        }

        body.section._a[0] = result;
        return this.formatFb2(fb2);
    }

    /**
     * Converts a samlib.ru / budclub.ru page to FB2.
     *
     * Book title and author are read from fixed element paths of the page;
     * the text itself is taken between two marker comments. <i>, <b> and
     * centered <div> become emphasis, strong and subtitle elements.
     *
     * @param {Buffer} data - raw page contents
     * @returns {Promise<string>} FB2 document
     */
    async convertSamlib(data) {
        const titleInfo = {};
        const desc = {_n: 'description', 'title-info': titleInfo};
        const pars = [];
        const body = {_n: 'body', section: {_a: [pars]}};
        const fb2 = [desc, body];

        let inSubtitle = false;
        let inText = false;
        // Path of currently open elements, tracked outside of the book text.
        let path = '';
        // Node that currently receives text and inline children.
        let node = {};

        const newParagraph = () => {
            node = {_n: 'p', _a: []};
            pars.push(node);
        };

        const openTag = (name) => {
            // _p links back to the parent so that closeTag() can return to it.
            const child = {_n: name, _a: [], _p: node};
            node._a.push(child);
            node = child;
        };

        const closeTag = (name) => {
            if (node._n === name && node._p) {
                node = node._p;
            }
        };

        const growParagraph = (text) => {
            if (node._n === 'p' && node._a.length === 0)
                text = text.trimLeft();
            node._a.push({_t: text});
        };

        newParagraph();

        const onStartNode = (elemName, tail) => {
            if (elemName === '')
                return;

            if (!inText) {
                path += '/' + elemName;
                return;
            }

            if (!inSubtitle && (elemName === 'p' || elemName === 'dd')) {
                newParagraph();
            }

            switch (elemName) {
                case 'i':
                    openTag('emphasis');
                    break;
                case 'b':
                    openTag('strong');
                    break;
                case 'div':
                    if (tail.indexOf('align="center"') >= 0) {
                        openTag('subtitle');
                        inSubtitle = true;
                    }
                    break;
            }
        };

        const onEndNode = (elemName) => {
            if (!inText) {
                // Unwind the path up to the element being closed; when it was
                // never opened, leave the path as it was.
                const oldPath = path;
                let last = '';
                do {
                    const i = path.lastIndexOf('/');
                    last = path.slice(i + 1);
                    path = path.slice(0, Math.max(i, 0));
                } while (last !== elemName && path);

                if (last !== elemName) {
                    path = oldPath;
                }
                return;
            }

            switch (elemName) {
                case 'i':
                    closeTag('emphasis');
                    break;
                case 'b':
                    closeTag('strong');
                    break;
                case 'div':
                    if (inSubtitle)
                        closeTag('subtitle');
                    inSubtitle = false;
                    break;
            }
        };

        const onComment = (text) => {
            // Marker comments that surround the book text on the page.
            if (text == '--------- Собственно произведение -------------')
                inText = true;
            if (text == '-----------------------------------------------')
                inText = false;
        };

        const onTextNode = (text) => {
            // Whitespace-only chunks are dropped, except for a single space.
            if (text !== ' ' && text.trim() === '')
                text = text.trim();

            if (text === '')
                return;

            switch (path) {
                case '/html/body/center/h2':
                    titleInfo['book-title'] = text;
                    return;
                case '/html/body/div/h3': {
                    if (!titleInfo.author)
                        titleInfo.author = {};
                    // Split on whitespace runs so that repeated spaces do not
                    // shift the name parts.
                    const parts = text.replace(':', '').trim().split(/\s+/);
                    if (parts[0])
                        titleInfo.author['last-name'] = parts[0];
                    if (parts[1])
                        titleInfo.author['first-name'] = parts[1];
                    if (parts[2])
                        titleInfo.author['middle-name'] = parts[2];
                    return;
                }
            }

            if (inText)
                growParagraph(text);
        };

        sax.parse(this.decode(data), {
            onStartNode, onEndNode, onTextNode, onComment,
            innerCut: new Set(['head', 'script', 'style'])
        });

        const title = (titleInfo['book-title'] ? titleInfo['book-title'] : '');
        let author = '';
        if (titleInfo.author) {
            author = _.compact([
                titleInfo.author['last-name'],
                titleInfo.author['first-name'],
                titleInfo.author['middle-name'],
            ]).join(' ');
        }

        // Title block goes first in the section: author and book title.
        pars.unshift({_n: 'title', _a: [
            {_n: 'p', _t: author}, {_n: 'p', _t: ''},
            {_n: 'p', _t: title}, {_n: 'p', _t: ''},
        ]});

        return this.formatFb2(fb2);
    }

    /**
     * Wraps a serialized node tree into a complete FB2 document.
     *
     * @param {*} fb2 - node tree (see the class description)
     * @returns {string} FB2 document
     */
    formatFb2(fb2) {
        let out = '<?xml version="1.0" encoding="utf-8"?>';
        out += '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">';
        out += this.formatFb2Node(fb2);
        out += '</FictionBook>';
        return out;
    }

    /**
     * Serializes a node tree to an XML fragment.
     *
     * - arrays are serialized item by item (the `name` argument is ignored);
     * - strings, numbers and booleans become text, wrapped in <name> if given;
     * - null/undefined produce nothing;
     * - objects become an element named `_n` (or `name`) that holds `_t`
     *   followed by the children.
     *
     * Text is emitted as is, apart from replacing "&nbsp;", tabs and line
     * breaks with a space; no XML escaping is applied.
     *
     * @param {*} node - node tree
     * @param {string} [name] - element name to use when the node has no `_n`
     * @returns {string} XML fragment
     */
    formatFb2Node(node, name) {
        const repl = (text) => text.replace(/&nbsp;|[\t\n\r]/g, ' ');

        if (node === null || node === undefined)
            return '';

        if (Array.isArray(node))
            return node.map((item) => this.formatFb2Node(item)).join('');

        const type = typeof node;
        if (type === 'string' || type === 'number' || type === 'boolean') {
            const text = repl(String(node));
            return (name ? `<${name}>${text}</${name}>` : text);
        }
        if (type !== 'object')
            return '';

        const tag = (node._n ? node._n : name);
        let out = (tag ? `<${tag}>` : '');

        const hasText = Object.prototype.hasOwnProperty.call(node, '_t')
            && node._t !== null && node._t !== undefined;
        if (hasText)
            out += repl(String(node._t));

        for (const key of Object.keys(node)) {
            // "_"-prefixed keys are service fields; only _a holds children.
            if (key && key[0] === '_' && key !== '_a')
                continue;
            out += this.formatFb2Node(node[key], key);
        }

        if (tag)
            out += `</${tag}>`;

        return out;
    }
}
module.exports = BookConverter;