Files
liberama/server/core/Reader/BookConverter/ConvertHtml.js
2019-10-29 19:07:34 +07:00

300 lines
9.0 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
const ConvertBase = require('./ConvertBase');
const sax = require('../../sax');
const textUtils = require('./textUtils');
class ConvertHtml extends ConvertBase {
check(data, opts) {
const {dataType} = opts;
if (dataType && (dataType.ext == 'html' || dataType.ext == 'xml'))
return {isText: false};
//может это чистый текст?
if (textUtils.checkIfText(data)) {
return {isText: true};
}
return false;
}
async run(data, opts) {
let isText = false;
if (!opts.skipCheck) {
const checkResult = this.check(data, opts);
if (!checkResult)
return false;
isText = checkResult.isText;
} else {
isText = opts.isText;
}
let {cutTitle} = opts;
let titleInfo = {};
let desc = {_n: 'description', 'title-info': titleInfo};
let pars = [];
let body = {_n: 'body', section: {_a: []}};
let binary = [];
let fb2 = [desc, body, binary];
let title = '';
let inTitle = false;
let inImage = false;
let image = {};
let bold = false;
let italic = false;
let spaceCounter = [];
const repCrLfTab = (text) => text.replace(/[\n\r]/g, '').replace(/\t/g, ' ');
const newParagraph = () => {
pars.push({_n: 'p', _t: ''});
};
const growParagraph = (text) => {
if (!pars.length)
newParagraph();
const l = pars.length;
pars[l - 1]._t += text;
//посчитаем отступы у текста, чтобы выделить потом параграфы
const lines = text.split('\n');
for (let line of lines) {
if (line.trim() == '')
continue;
line = repCrLfTab(line);
let l = 0;
while (l < line.length && line[l] == ' ') {
l++;
}
if (!spaceCounter[l])
spaceCounter[l] = 0;
spaceCounter[l]++;
}
};
const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'li', 'dt', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
text = this.escapeEntities(text);
if (!cutCounter && !(cutTitle && inTitle)) {
let tOpen = (bold ? '<strong>' : '');
tOpen += (italic ? '<emphasis>' : '');
let tClose = (italic ? '</emphasis>' : '');
tClose += (bold ? '</strong>' : '');
growParagraph(`${tOpen}${text}${tClose}`);
}
if (inTitle && !title)
title = text;
if (inImage) {
image._t = text;
binary.push(image);
pars.push({_n: 'image', _attrs: {'l:href': '#' + image._attrs.id}, _t: ''});
newParagraph();
}
};
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (!cutCounter) {
if (newPara.has(tag))
newParagraph();
switch (tag) {
case 'i':
case 'em':
italic = true;
break;
case 'b':
case 'strong':
case 'h1':
case 'h2':
case 'h3':
bold = true;
break;
}
}
if (tag == 'title' || tag == 'cut-title') {
inTitle = true;
if (tag == 'cut-title')
cutTitle = true;
}
if (tag == 'fb2-image') {
inImage = true;
const attrs = sax.getAttrsSync(tail);
image = {_n: 'binary', _attrs: {id: attrs.name.value, 'content-type': attrs.type.value}, _t: ''};
}
};
const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (!cutCounter) {
if (newPara.has('/' + tag))
newParagraph();
switch (tag) {
case 'i':
case 'em':
italic = false;
break;
case 'b':
case 'strong':
case 'h1':
case 'h2':
case 'h3':
bold = false;
break;
}
}
if (tag == 'title' || tag == 'cut-title')
inTitle = false;
if (tag == 'fb2-image')
inImage = false;
};
let buf = this.decode(data).toString();
sax.parseSync(buf, {
onStartNode, onEndNode, onTextNode,
innerCut: new Set(['head', 'script', 'style', 'binary', 'fb2-image'])
});
titleInfo['book-title'] = title;
//подозрение на чистый текст, надо разбить на параграфы
if (isText || pars.length < buf.length/2000) {
let total = 0;
let count = 1;
for (let i = 0; i < spaceCounter.length; i++) {
const sc = (spaceCounter[i] ? spaceCounter[i] : 0);
if (sc) count++;
total += sc;
}
let d = 0;
const mid = total/count;
for (let i = 0; i < spaceCounter.length; i++) {
const sc = (spaceCounter[i] ? spaceCounter[i] : 0);
if (sc > mid) d++;
}
let i = 0;
//если разброс не слишком большой, выделяем параграфы
if (d < 10 && spaceCounter.length) {
total /= 20;
i = spaceCounter.length - 1;
while (i > 0 && (!spaceCounter[i] || spaceCounter[i] < total)) i--;
}
const parIndent = (i > 0 ? i : 0);
let newPars = [];
const newPar = () => {
newPars.push({_n: 'p', _t: ''});
};
const growPar = (text) => {
if (!newPars.length)
newPar();
const l = newPars.length;
newPars[l - 1]._t += text;
}
i = 0;
for (const par of pars) {
if (par._n != 'p') {
newPars.push(par);
continue;
}
if (i > 0)
newPar();
i++;
let j = 0;
const lines = par._t.split('\n');
for (let line of lines) {
line = repCrLfTab(line);
let l = 0;
while (l < line.length && line[l] == ' ') {
l++;
}
if (l >= parIndent) {
if (j > 0)
newPar();
j++;
}
growPar(line.trim() + ' ');
}
}
body.section._a[0] = newPars;
} else {
body.section._a[0] = pars;
}
//убираем лишнее, делаем валидный fb2, т.к. в рез-те разбиения на параграфы бьются теги
bold = false;
italic = false;
pars = body.section._a[0];
for (let i = 0; i < pars.length; i++) {
if (pars[i]._n != 'p')
continue;
pars[i]._t = this.repSpaces(pars[i]._t).trim();
if (pars[i]._t.indexOf('<') >= 0) {
const t = pars[i]._t;
let a = [];
const onTextNode = (text) => {
let tOpen = (bold ? '<strong>' : '');
tOpen += (italic ? '<emphasis>' : '');
let tClose = (italic ? '</emphasis>' : '');
tClose += (bold ? '</strong>' : '');
a.push(`${tOpen}${text}${tClose}`);
}
const onStartNode = (tag) => {
if (tag == 'strong')
bold = true;
if (tag == 'emphasis')
italic = true;
}
const onEndNode = (tag) => {
if (tag == 'strong')
bold = false;
if (tag == 'emphasis')
italic = false;
}
sax.parseSync(t, { onStartNode, onEndNode, onTextNode });
pars[i]._t = '';
pars[i]._a = a;
}
}
return this.formatFb2(fb2);
}
}
module.exports = ConvertHtml;