Compare commits

...

27 Commits

Author SHA1 Message Date
Book Pauk
7fa891b4fc Merge branch 'release/0.9.11' 2020-12-09 22:31:33 +07:00
Book Pauk
6cb7412cf3 Версия 0.9.11 2020-12-09 22:30:58 +07:00
Book Pauk
157322834b Небольшая поправка 2020-12-09 22:30:19 +07:00
Book Pauk
1a13a0fee1 Работа над конвертером pdf 2020-12-09 22:19:14 +07:00
Book Pauk
37256255bf Добавлена поддержка тегов 'sup' и 'sub' 2020-12-09 20:35:52 +07:00
Book Pauk
75e01c899e Работа над конвертером pdf 2020-12-09 20:08:17 +07:00
Book Pauk
ef0d6eab89 Работа над конвертером Pdf 2020-12-09 19:05:09 +07:00
Book Pauk
5d54b1b0f4 Работа над конвертером pdf 2020-12-09 03:52:24 +07:00
Book Pauk
522f953b4f Работа над конвертером pdf 2020-12-09 03:06:15 +07:00
Book Pauk
15f02c7115 Работа над конвертером pdf 2020-12-09 01:29:58 +07:00
Book Pauk
174c877eee Рефакторинг, плюс небольшие доработки 2020-12-09 01:29:09 +07:00
Book Pauk
fd9ec736d7 Рефакторинг 2020-12-08 19:36:53 +07:00
Book Pauk
2c94025ba3 Поправлен баг 2020-12-08 19:31:00 +07:00
Book Pauk
bfadf35c40 Закончена работа над xmlParser, оттестировано 2020-12-08 18:48:55 +07:00
Book Pauk
f3b69caa12 Работа над модулем xmlParser 2020-12-08 16:17:36 +07:00
Book Pauk
18a83a5b0b Поправки настроек сжатия 2020-12-08 14:26:49 +07:00
Book Pauk
bd9669b782 Поправка цели dev 2020-12-08 14:26:25 +07:00
Book Pauk
e05713aa7f Работа над конвертером pdf 2020-12-08 14:15:17 +07:00
Book Pauk
bc3e1f0a6f Мелкий рефакторинг 2020-12-07 22:13:14 +07:00
Book Pauk
063d01b5ca Перевод pdf-конвертера на использование pdfalto 2020-12-07 22:05:01 +07:00
Book Pauk
81c38d7749 Мелкий рефакторинг 2020-12-07 20:13:32 +07:00
Book Pauk
a29842b084 Поправка readme 2020-12-07 20:12:37 +07:00
Book Pauk
bb5adcdaf6 Рефакторинг 2020-12-07 01:30:10 +07:00
Book Pauk
537e17a219 Merge tag '0.9.10-5' into develop
0.9.10-5
2020-12-05 13:42:45 +07:00
Book Pauk
03ce50153e Merge branch 'release/0.9.10-5' 2020-12-05 13:42:39 +07:00
Book Pauk
15d01ad7fc Коррекция таймаутов очереди ожидания 2020-12-05 13:41:42 +07:00
Book Pauk
e2b29e2c2f Merge tag '0.9.10-4' into develop
0.9.10-4
2020-12-05 13:25:10 +07:00
17 changed files with 554 additions and 219 deletions

View File

@@ -66,7 +66,7 @@ class Reader {
await utils.sleep(refreshPause);
i++;
if (i > 120*1000/refreshPause) {//2 мин ждем телодвижений воркера
if (i > 180*1000/refreshPause) {//3 мин ждем телодвижений воркера
throw new Error('Слишком долгое время ожидания');
}
//проверка воркера

View File

@@ -77,9 +77,15 @@ export default class DrawHelper {
let j = 0;
//формируем строку
for (const part of line.parts) {
let tOpen = (part.style.bold ? '<b>' : '');
let tOpen = '';
tOpen += (part.style.bold ? '<b>' : '');
tOpen += (part.style.italic ? '<i>' : '');
let tClose = (part.style.italic ? '</i>' : '');
tOpen += (part.style.sup ? '<span style="vertical-align: baseline; position: relative; line-height: 0; top: -0.3em">' : '');
tOpen += (part.style.sub ? '<span style="vertical-align: baseline; position: relative; line-height: 0; top: 0.3em">' : '');
let tClose = '';
tClose += (part.style.sub ? '</span>' : '');
tClose += (part.style.sup ? '</span>' : '');
tClose += (part.style.italic ? '</i>' : '');
tClose += (part.style.bold ? '</b>' : '');
let text = '';

View File

@@ -285,7 +285,7 @@ export default class BookParser {
sectionLevel++;
}
if (tag == 'emphasis' || tag == 'strong') {
if (tag == 'emphasis' || tag == 'strong' || tag == 'sup' || tag == 'sub') {
growParagraph(`<${tag}>`, 0);
}
@@ -343,7 +343,7 @@ export default class BookParser {
sectionLevel--;
}
if (tag == 'emphasis' || tag == 'strong') {
if (tag == 'emphasis' || tag == 'strong' || tag == 'sup' || tag == 'sub') {
growParagraph(`</${tag}>`, 0);
}
@@ -507,7 +507,7 @@ export default class BookParser {
splitToStyle(s) {
let result = [];/*array of {
style: {bold: Boolean, italic: Boolean, center: Boolean, space: Number},
style: {bold: Boolean, italic: Boolean, sup: Boolean, sub: Boolean, center: Boolean, space: Number},
image: {local: Boolean, inline: Boolean, id: String},
text: String,
}*/
@@ -530,6 +530,12 @@ export default class BookParser {
case 'emphasis':
style.italic = true;
break;
case 'sup':
style.sup = true;
break;
case 'sub':
style.sub = true;
break;
case 'center':
style.center = true;
break;
@@ -580,6 +586,12 @@ export default class BookParser {
case 'emphasis':
style.italic = false;
break;
case 'sup':
style.sup = false;
break;
case 'sub':
style.sub = false;
break;
case 'center':
style.center = false;
break;

View File

@@ -169,7 +169,7 @@ class BookManager {
}
async deflateWithProgress(data, callback) {
const chunkSize = 128*1024;
const chunkSize = 512*1024;
const deflator = new utils.pako.Deflate({level: 5});
let chunkTotal = 1 + Math.floor(data.length/chunkSize);
@@ -203,7 +203,7 @@ class BookManager {
}
async inflateWithProgress(data, callback) {
const chunkSize = 64*1024;
const chunkSize = 512*1024;
const inflator = new utils.pako.Inflate({to: 'string'});
let chunkTotal = 1 + Math.floor(data.length/chunkSize);

View File

@@ -1,4 +1,15 @@
export const versionHistory = [
{
showUntil: '2020-12-08',
header: '0.9.11 (2020-12-09)',
content:
`
<ul>
<li>оптимизации, улучшения работы конвертеров</li>
</ul>
`
},
{
showUntil: '2020-12-10',
header: '0.9.10 (2020-12-03)',

View File

@@ -32,11 +32,23 @@ sudo -u www-data mkdir -p /home/liberama/data/calibre
sudo -u www-data tar xvf calibre-5.5.0-x86_64.txz -C /home/liberama/data/calibre
```
### external converter `pdfalto`, github https://github.com/kermitt2/pdfalto
```
git clone https://github.com/kermitt2/pdfalto
cd pdfalto
git submodule update --init --recursive
cmake ./
# добавить в начало CMakeLists.txt строчку: set(CMAKE_EXE_LINKER_FLAGS "-no-pie")
make
sudo -u www-data mkdir -p /home/liberama/data/pdfalto
sudo -u www-data cp pdfalto /home/liberama/data/pdfalto
```
### external converters
```
sudo apt install rar
sudo apt install libreoffice
sudo apt install poppler-utils
sudo apt install djvulibre-bin
sudo apt install libtiff-tools
sudo apt install graphicsmagick-imagemagick-compat

2
package-lock.json generated
View File

@@ -1,6 +1,6 @@
{
"name": "Liberama",
"version": "0.9.10",
"version": "0.9.11",
"lockfileVersion": 1,
"requires": true,
"dependencies": {

View File

@@ -1,6 +1,6 @@
{
"name": "Liberama",
"version": "0.9.10",
"version": "0.9.11",
"author": "Book Pauk <bookpauk@gmail.com>",
"license": "CC0-1.0",
"repository": "bookpauk/liberama",
@@ -8,7 +8,7 @@
"node": ">=10.0.0"
},
"scripts": {
"dev": "nodemon --inspect --ignore server/public --ignore server/data --exec 'node server'",
"dev": "nodemon --inspect --ignore server/public --ignore server/data --ignore client --exec 'node server'",
"build:client": "webpack --config build/webpack.prod.config.js",
"build:linux": "npm run build:client && node build/linux && pkg -t latest-linux-x64 -o dist/linux/liberama .",
"build:win": "npm run build:client && node build/win && pkg -t latest-win-x64 -o dist/win/liberama .",

View File

@@ -136,7 +136,7 @@ class WebSocketController {
break;
i++;
if (i > 2*60*1000/refreshPause) {//2 мин ждем телодвижений воркера
if (i > 3*60*1000/refreshPause) {//3 мин ждем телодвижений воркера
this.send({state: 'error', error: 'Время ожидания процесса истекло'}, req, ws);
break;
}

View File

@@ -5,8 +5,9 @@ const he = require('he');
const LimitedQueue = require('../../LimitedQueue');
const textUtils = require('./textUtils');
const utils = require('../../utils');
const xmlParser = require('../../xmlParser');
const queue = new LimitedQueue(3, 20, 3*60*1000);//3 минуты ожидание подвижек
const queue = new LimitedQueue(3, 20, 2*60*1000);//2 минуты ожидание подвижек
class ConvertBase {
constructor(config) {
@@ -14,7 +15,6 @@ class ConvertBase {
this.calibrePath = `${config.dataDir}/calibre/ebook-convert`;
this.sofficePath = '/usr/bin/soffice';
this.pdfToHtmlPath = '/usr/bin/pdftohtml';
}
async run(data, opts) {// eslint-disable-line no-unused-vars
@@ -27,9 +27,6 @@ class ConvertBase {
if (!await fs.pathExists(this.sofficePath))
throw new Error('Внешний конвертер LibreOffice не найден');
if (!await fs.pathExists(this.pdfToHtmlPath))
throw new Error('Внешний конвертер pdftohtml не найден');
}
async execConverter(path, args, onData, abort) {
@@ -106,61 +103,14 @@ class ConvertBase {
}
formatFb2(fb2) {
let out = '<?xml version="1.0" encoding="utf-8"?>';
out += '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">';
out += this.formatFb2Node(fb2);
out += '</FictionBook>';
return out;
}
formatFb2Node(node, name) {
let out = '';
if (Array.isArray(node)) {
for (const n of node) {
out += this.formatFb2Node(n);
const out = xmlParser.formatXml({
FictionBook: {
_attrs: {xmlns: 'http://www.gribuser.ru/xml/fictionbook/2.0', 'xmlns:l': 'http://www.w3.org/1999/xlink'},
_a: [fb2],
}
} else if (typeof node == 'string') {
if (name)
out += `<${name}>${this.repSpaces(node)}</${name}>`;
else
out += this.repSpaces(node);
} else {
if (node._n)
name = node._n;
}, 'utf-8', this.repSpaces);
let attrs = '';
if (node._attrs) {
for (let attrName in node._attrs) {
attrs += ` ${attrName}="${node._attrs[attrName]}"`;
}
}
let tOpen = '';
let tBody = '';
let tClose = '';
if (name)
tOpen += `<${name}${attrs}>`;
if (node.hasOwnProperty('_t'))
tBody += this.repSpaces(node._t);
for (let nodeName in node) {
if (nodeName && nodeName[0] == '_' && nodeName != '_a')
continue;
const n = node[nodeName];
tBody += this.formatFb2Node(n, nodeName);
}
if (name)
tClose += `</${name}>`;
if (attrs == '' && name == 'p' && tBody.trim() == '')
out += '<empty-line/>'
else
out += `${tOpen}${tBody}${tClose}`;
}
return out;
return out.replace(/<p>\s*?<\/p>/g, '<empty-line/>');
}
}

View File

@@ -2,9 +2,9 @@ const fs = require('fs-extra');
const path = require('path');
const utils = require('../../utils');
const ConvertHtml = require('./ConvertHtml');
const ConvertBase = require('./ConvertBase');
class ConvertDjvu extends ConvertHtml {
class ConvertDjvu extends ConvertBase {
check(data, opts) {
const {inputFiles} = opts;
@@ -59,9 +59,17 @@ class ConvertDjvu extends ConvertHtml {
}, abort);
//читаем изображения
limitSize = 2*this.config.maxUploadFileSize;
let imagesSize = 0;
const loadImage = async(image) => {
image.data = (await fs.readFile(image.file)).toString('base64');
image.name = path.basename(image.file);
imagesSize += image.data.length;
if (imagesSize > limitSize) {
throw new Error(`Файл для конвертирования слишком большой|FORLOG| imagesSize: ${imagesSize} > ${limitSize}`);
}
}
let files = [];
@@ -82,20 +90,29 @@ class ConvertDjvu extends ConvertHtml {
await Promise.all(loading);
//формируем текст
limitSize = 2*this.config.maxUploadFileSize;
//формируем fb2
let titleInfo = {};
let desc = {_n: 'description', 'title-info': titleInfo};
let pars = [];
let body = {_n: 'body', section: {_a: [pars]}};
let binary = [];
let fb2 = [desc, body, binary];
let title = '';
if (uploadFileName)
title = uploadFileName;
let text = `<title>${title}</title>`;
for (const image of images) {
text += `<fb2-image type="image/jpeg" name="${image.name}">${image.data}</fb2-image>`;
if (text.length > limitSize) {
throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`);
}
titleInfo['book-title'] = title;
for (const image of images) {
const img = {_n: 'binary', _attrs: {id: image.name, 'content-type': 'image/jpeg'}, _t: image.data};
binary.push(img);
pars.push({_n: 'p', _t: ''});
pars.push({_n: 'image', _attrs: {'l:href': `#${image.name}`}});
}
return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
return this.formatFb2(fb2);
}
}

View File

@@ -2,7 +2,7 @@ const fs = require('fs-extra');
const ConvertHtml = require('./ConvertHtml');
class ConvertDocX extends ConvertHtml {
class ConvertFb3 extends ConvertHtml {
async check(data, opts) {
const {inputFiles} = opts;
if (this.config.useExternalBookConverter &&
@@ -39,13 +39,14 @@ class ConvertDocX extends ConvertHtml {
const title = this.getTitle(text)
.replace(/<\/?p>/g, '')
;
text = `<title>${title}</title>` + text
text = `<fb2-title>${title}</fb2-title>` + text
.replace(/<title>/g, '<br><b>')
.replace(/<\/title>/g, '</b><br>')
.replace(/<subtitle>/g, '<br><br><subtitle>')
.replace(/<subtitle>/g, '<br><br><fb2-subtitle>')
.replace(/<\/subtitle>/g, '</fb2-subtitle>')
;
return await super.run(Buffer.from(text), {skipCheck: true, cutTitle: true});
return await super.run(Buffer.from(text), {skipCheck: true});
}
}
module.exports = ConvertDocX;
module.exports = ConvertFb3;

View File

@@ -34,7 +34,6 @@ class ConvertHtml extends ConvertBase {
} else {
isText = opts.isText;
}
let {cutTitle} = opts;
let titleInfo = {};
let desc = {_n: 'description', 'title-info': titleInfo};
@@ -44,12 +43,17 @@ class ConvertHtml extends ConvertBase {
let fb2 = [desc, body, binary];
let title = '';
let author = '';
let inTitle = false;
let inSectionTitle = false;
let inAuthor = false;
let inSubTitle = false;
let inImage = false;
let image = {};
let bold = false;
let italic = false;
let superscript = false;
let subscript = false;
let begining = true;
let spaceCounter = [];
@@ -62,7 +66,7 @@ class ConvertHtml extends ConvertBase {
};
const growParagraph = (text) => {
if (!pars.length)
if (!pars.length || pars[pars.length - 1]._n != 'p')
newParagraph();
const l = pars.length;
@@ -94,12 +98,16 @@ class ConvertHtml extends ConvertBase {
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
text = this.escapeEntities(text);
if (!cutCounter && !(cutTitle && inTitle)) {
if (!(cutCounter || inTitle || inSectionTitle || inSubTitle)) {
let tOpen = '';
tOpen += (inSubTitle ? '<subtitle>' : '');
tOpen += (bold ? '<strong>' : '');
tOpen += (italic ? '<emphasis>' : '');
tOpen += (superscript ? '<sup>' : '');
tOpen += (subscript ? '<sub>' : '');
let tClose = ''
tClose += (subscript ? '</sub>' : '');
tClose += (superscript ? '</sup>' : '');
tClose += (italic ? '</emphasis>' : '');
tClose += (bold ? '</strong>' : '');
tClose += (inSubTitle ? '</subtitle>' : '');
@@ -110,12 +118,22 @@ class ConvertHtml extends ConvertBase {
if (inTitle && !title)
title = text;
if (inAuthor && !author)
author = text;
if (inSectionTitle) {
pars.unshift({_n: 'title', _t: text});
}
if (inSubTitle) {
pars.push({_n: 'subtitle', _t: text});
}
if (inImage) {
image._t = text;
binary.push(image);
pars.push({_n: 'image', _attrs: {'l:href': '#' + image._attrs.id}, _t: ''});
newParagraph();
}
};
@@ -140,15 +158,27 @@ class ConvertHtml extends ConvertBase {
bold = true;
break;
}
if (tag == 'sup')
superscript = true;
if (tag == 'sub')
subscript = true;
}
if (tag == 'title' || tag == 'cut-title') {
if (tag == 'title' || tag == 'fb2-title') {
inTitle = true;
if (tag == 'cut-title')
cutTitle = true;
}
if (tag == 'subtitle') {
if (tag == 'fb2-author') {
inAuthor = true;
}
if (tag == 'fb2-section-title') {
inSectionTitle = true;
}
if (tag == 'fb2-subtitle') {
inSubTitle = true;
}
@@ -156,7 +186,7 @@ class ConvertHtml extends ConvertBase {
inImage = true;
const attrs = sax.getAttrsSync(tail);
image = {_n: 'binary', _attrs: {id: attrs.name.value, 'content-type': attrs.type.value}, _t: ''};
}
}
};
const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
@@ -179,12 +209,26 @@ class ConvertHtml extends ConvertBase {
bold = false;
break;
}
if (tag == 'sup')
superscript = false;
if (tag == 'sub')
subscript = false;
}
if (tag == 'title' || tag == 'cut-title')
if (tag == 'title' || tag == 'fb2-title')
inTitle = false;
if (tag == 'subtitle')
if (tag == 'fb2-author') {
inAuthor = false;
}
if (tag == 'fb2-section-title') {
inSectionTitle = false;
}
if (tag == 'fb2-subtitle')
inSubTitle = false;
if (tag == 'fb2-image')
@@ -195,10 +239,17 @@ class ConvertHtml extends ConvertBase {
sax.parseSync(buf, {
onStartNode, onEndNode, onTextNode,
innerCut: new Set(['head', 'script', 'style', 'binary', 'fb2-image'])
innerCut: new Set(['head', 'script', 'style', 'binary', 'fb2-image', 'fb2-title', 'fb2-author'])
});
titleInfo['book-title'] = title;
if (author)
titleInfo.author = {'last-name': author};
body.section._a[0] = pars;
//console.log(JSON.stringify(fb2, null, 2));
//подозрение на чистый текст, надо разбить на параграфы
if (isText || (buf.length > 30*1024 && pars.length < buf.length/2000)) {
let total = 0;
@@ -228,56 +279,49 @@ class ConvertHtml extends ConvertBase {
if (parIndent > 2) parIndent--;
let newPars = [];
let curPar = {};
const newPar = () => {
newPars.push({_n: 'p', _t: ''});
curPar = {_n: 'p', _t: ''};
newPars.push(curPar);
};
const growPar = (text) => {
if (!newPars.length)
newPar();
const l = newPars.length;
newPars[l - 1]._t += text;
}
i = 0;
for (const par of pars) {
if (par._n != 'p') {
newPars.push(par);
continue;
}
if (i > 0)
newPar();
i++;
let j = 0;
newPar();
const lines = par._t.split('\n');
for (let line of lines) {
line = repCrLfTab(line);
for (let j = 0; j < lines.length; j++) {
const line = repCrLfTab(lines[j]);
let l = 0;
while (l < line.length && line[l] == ' ') {
l++;
}
if (l >= parIndent || line == '') {
if (j > 0)
newPar();
j++;
if (j > 0 &&
(l >= parIndent ||
(j < lines.length - 1 && line == '')
)
) {
newPar();
}
growPar(line.trim() + ' ');
curPar._t += line.trim() + ' ';
}
}
body.section._a[0] = newPars;
} else {
body.section._a[0] = pars;
}
//убираем лишнее, делаем валидный fb2, т.к. в рез-те разбиения на параграфы бьются теги
bold = false;
italic = false;
superscript = false;
subscript = false;
inSubTitle = false;
pars = body.section._a[0];
for (let i = 0; i < pars.length; i++) {
@@ -297,7 +341,11 @@ class ConvertHtml extends ConvertBase {
tOpen += (inSubTitle ? '<subtitle>' : '');
tOpen += (bold ? '<strong>' : '');
tOpen += (italic ? '<emphasis>' : '');
tOpen += (superscript ? '<sup>' : '');
tOpen += (subscript ? '<sub>' : '');
let tClose = ''
tClose += (subscript ? '</sub>' : '');
tClose += (superscript ? '</sup>' : '');
tClose += (italic ? '</emphasis>' : '');
tClose += (bold ? '</strong>' : '');
tClose += (inSubTitle ? '</subtitle>' : '');
@@ -313,6 +361,10 @@ class ConvertHtml extends ConvertBase {
bold = true;
if (tag == 'emphasis')
italic = true;
if (tag == 'sup')
superscript = true;
if (tag == 'sub')
subscript = true;
if (tag == 'subtitle')
inSubTitle = true;
}
@@ -322,6 +374,10 @@ class ConvertHtml extends ConvertBase {
bold = false;
if (tag == 'emphasis')
italic = false;
if (tag == 'sup')
superscript = false;
if (tag == 'sub')
subscript = false;
if (tag == 'subtitle')
inSubTitle = false;
}

View File

@@ -1,9 +1,11 @@
//const _ = require('lodash');
const fs = require('fs-extra');
const path = require('path');
const sax = require('../../sax');
const utils = require('../../utils');
const ConvertHtml = require('./ConvertHtml');
const xmlParser = require('../../xmlParser');
class ConvertPdf extends ConvertHtml {
check(data, opts) {
@@ -22,11 +24,18 @@ class ConvertPdf extends ConvertHtml {
const {inputFiles, callback, abort, uploadFileName} = opts;
const inpFile = inputFiles.sourceFile;
const outFile = `${inputFiles.filesDir}/${utils.randomHexString(10)}.xml`;
const outBasename = `${inputFiles.filesDir}/${utils.randomHexString(10)}`;
const outFile = `${outBasename}.xml`;
const metaFile = `${outBasename}_metadata.xml`;
const pdfaltoPath = `${this.config.dataDir}/pdfalto/pdfalto`;
if (!await fs.pathExists(pdfaltoPath))
throw new Error('Внешний конвертер pdfalto не найден');
//конвертируем в xml
let perc = 0;
await this.execConverter(this.pdfToHtmlPath, ['-nodrm', '-c', '-s', '-xml', inpFile, outFile], () => {
await this.execConverter(pdfaltoPath, [inpFile, outFile], () => {
perc = (perc < 80 ? perc + 10 : 40);
callback(perc);
}, abort);
@@ -35,17 +44,22 @@ class ConvertPdf extends ConvertHtml {
const data = await fs.readFile(outFile);
callback(90);
await utils.sleep(100);
//парсим xml
let lines = [];
let pagelines = [];
let line = {text: ''};
let page = {};
let fonts = {};
let sectionTitleFound = false;
let images = [];
let loading = [];
let inText = false;
let bold = false;
let italic = false;
let title = '';
let prevTop = 0;
let author = '';
let i = -1;
let titleCount = 0;
const loadImage = async(image) => {
const src = path.parse(image.src);
@@ -59,7 +73,7 @@ class ConvertPdf extends ConvertHtml {
image.type = type;
image.name = src.base;
}
}
};
const putImage = (curTop) => {
if (!isNaN(curTop) && images.length) {
@@ -69,104 +83,180 @@ class ConvertPdf extends ConvertHtml {
images.shift();
}
}
}
};
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (!cutCounter && inText) {
let tOpen = (bold ? '<b>' : '');
tOpen += (italic ? '<i>' : '');
let tClose = (italic ? '</i>' : '');
tClose += (bold ? '</b>' : '');
const putPageLines = () => {
pagelines.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left))
//объединяем в одну строку равные по высоте
const pl = [];
let pt = 0;
let j = -1;
pagelines.forEach(line => {
//добавим закрывающий тег стиля
line.text += line.tClose;
lines[i].text += `${tOpen}${text}${tClose} `;
if (titleCount < 2 && text.trim() != '') {
title += text + (titleCount ? '' : ' - ');
titleCount++;
//проверим, возможно это заголовок
if (line.fonts.length == 1 && line.pageWidth) {
const f = (line.fonts.length ? fonts[line.fonts[0]] : null);
const centerLeft = (line.pageWidth - line.width)/2;
if (f && f.isBold && Math.abs(centerLeft - line.left) < 3) {
if (!sectionTitleFound) {
line.isSectionTitle = true;
sectionTitleFound = true;
} else {
line.isSubtitle = true;
}
}
}
}
//объединяем
if (pt == 0 || Math.abs(pt - line.top) > 3) {
j++;
pl[j] = line;
} else {
pl[j].text += ` ${line.text}`;
}
pt = line.top;
});
//заполняем lines
const lastIndex = i;
pl.forEach(line => {
putImage(line.top);
//добавим пустую строку, если надо
const prevLine = (i > lastIndex ? lines[i] : {fonts: [], top: 0});
if (prevLine && !prevLine.isImage) {
const f = (prevLine.fonts.length ? fonts[prevLine.fonts[0]] : (line.fonts.length ? fonts[line.fonts[0]] : null));
if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize*1.8) {
i++;
lines[i] = {text: '<br>'};
}
}
i++;
lines[i] = line;
});
pagelines = [];
putImage(100000);
};
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (!cutCounter) {
if (inText) {
switch (tag) {
case 'i':
italic = true;
break;
case 'b':
bold = true;
break;
if (tag == 'textstyle') {
const attrs = sax.getAttrsSync(tail);
const fontId = (attrs.id && attrs.id.value ? attrs.id.value : '');
const fontStyle = (attrs.fontstyle && attrs.fontstyle.value ? attrs.fontstyle.value : '');
const fontSize = (attrs.fontsize && attrs.fontsize.value ? attrs.fontsize.value : '');
if (fontId) {
const styleTags = {bold: 'b', italics: 'i', superscript: 'sup', subscript: 'sub'};
const f = fonts[fontId] = {tOpen: '', tClose: '', isBold: false, fontSize};
if (fontStyle) {
const styles = fontStyle.split(' ');
styles.forEach(style => {
const s = styleTags[style];
if (s) {
f.tOpen += `<${s}>`;
f.tClose = `</${s}>${f.tClose}`;
if (s == 'b')
f.isBold = true;
}
});
}
}
}
if (tag == 'text' && !inText) {
let attrs = sax.getAttrsSync(tail);
const line = {
text: '',
top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10),
left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10),
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
};
if (tag == 'page') {
const attrs = sax.getAttrsSync(tail);
page = {
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
};
if (line.width != 0 || line.height != 0) {
inText = true;
if (isNaN(line.top) || isNaN(prevTop) || (Math.abs(prevTop - line.top) > 3)) {
putImage(line.top);
i++;
lines[i] = line;
}
prevTop = line.top;
}
putPageLines();
}
if (tag == 'textline') {
const attrs = sax.getAttrsSync(tail);
line = {
text: '',
top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10),
left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10),
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
tOpen: '',
tClose: '',
isSectionTitle: false,
isSubtitle: false,
pageWidth: page.width,
fonts: [],
};
if (line.width != 0 || line.height != 0) {
pagelines.push(line);
}
}
if (tag == 'image') {
const attrs = sax.getAttrsSync(tail);
const src = (attrs.src && attrs.src.value ? attrs.src.value : '');
if (tag == 'string') {
const attrs = sax.getAttrsSync(tail);
if (attrs.content && attrs.content.value) {
let tOpen = '';
let tClose = '';
const fontId = (attrs.stylerefs && attrs.stylerefs.value ? attrs.stylerefs.value : '');
if (fontId && fonts[fontId]) {
tOpen = fonts[fontId].tOpen;
tClose = fonts[fontId].tClose;
if (!line.fonts.length || line.fonts[0] != fontId)
line.fonts.push(fontId);
}
if (line.tOpen != tOpen) {
line.text += line.tClose + tOpen;
line.tOpen = tOpen;
line.tClose = tClose;
}
line.text += `${line.text.length ? ' ' : ''}${attrs.content.value}`;
}
}
if (tag == 'illustration') {
const attrs = sax.getAttrsSync(tail);
if (attrs.type && attrs.type.value == 'image') {
let src = (attrs.fileid && attrs.fileid.value ? attrs.fileid.value : '');
if (src) {
const image = {
isImage: true,
src,
data: '',
type: '',
top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10) || 0,
top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10) || 0,
left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10) || 0,
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
};
loading.push(loadImage(image));
images.push(image);
images.sort((a, b) => a.top - b.top)
const exists = images.filter(img => (img.top == image.top && img.left == image.left && img.width == image.width && img.height == image.height));
if (!exists.length) {
loading.push(loadImage(image));
images.push(image);
images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
}
}
}
if (tag == 'page') {
putImage(100000);
}
}
};
const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (inText) {
switch (tag) {
case 'i':
italic = false;
break;
case 'b':
bold = false;
break;
}
}
if (tag == 'text')
inText = false;
};
let buf = this.decode(data).toString();
sax.parseSync(buf, {
onStartNode, onEndNode, onTextNode
onStartNode
});
putImage(100000);
putPageLines();
await Promise.all(loading);
await utils.sleep(100);
//найдем параграфы и отступы
const indents = [];
@@ -187,11 +277,29 @@ class ConvertPdf extends ConvertHtml {
}
indents[0] = 0;
//формируем текст
const limitSize = 2*this.config.maxUploadFileSize;
//title
if (fs.pathExists(metaFile)) {
const metaXmlString = (await fs.readFile(metaFile)).toString();
let metaXmlParsed = xmlParser.parseXml(metaXmlString);
metaXmlParsed = xmlParser.simplifyXmlParsed(metaXmlParsed);
if (metaXmlParsed.metadata) {
title = (metaXmlParsed.metadata.title ? metaXmlParsed.metadata.title._t : '');
author = (metaXmlParsed.metadata.author ? metaXmlParsed.metadata.author._t : '');
}
}
if (!title && uploadFileName)
title = uploadFileName;
let text = `<title>${title}</title>`;
//console.log(JSON.stringify(lines, null, 2));
//формируем текст
const limitSize = 2*this.config.maxUploadFileSize;
let text = '';
if (title)
text += `<fb2-title>${title}</fb2-title>`;
if (author)
text += `<fb2-author>${author}</fb2-author>`;
let concat = '';
let sp = '';
for (const line of lines) {
@@ -204,6 +312,16 @@ class ConvertPdf extends ConvertHtml {
continue;
}
if (line.isSectionTitle) {
text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`;
continue;
}
if (line.isSubtitle) {
text += `<br><fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
continue;
}
if (concat == '') {
const left = line.left || 0;
sp = ' '.repeat(indents[left]);
@@ -221,7 +339,9 @@ class ConvertPdf extends ConvertHtml {
if (concat)
text += sp + concat + "\n";
return await super.run(Buffer.from(text), {skipCheck: true, isText: true, cutTitle: true});
//console.log(text);
await utils.sleep(100);
return await super.run(Buffer.from(text), {skipCheck: true, isText: true});
}
}

View File

@@ -48,7 +48,7 @@ class ConvertSites extends ConvertHtml {
if (text === false)
return false;
return await super.run(Buffer.from(text), {skipCheck: true, cutTitle: true});
return await super.run(Buffer.from(text), {skipCheck: true});
}
getTitle(text) {
@@ -79,7 +79,7 @@ class ConvertSites extends ConvertHtml {
let book = this.getTitle(text);
book = book.replace(' (fb2) | Флибуста', '');
const title = `<title>${author}${(author ? ' - ' : '')}${book}</title>`;
const title = `<fb2-title>${author}${(author ? ' - ' : '')}${book}</fb2-title>`;
let begin = '<h3 class="book">';
if (text.indexOf(begin) <= 0)
@@ -95,12 +95,12 @@ class ConvertSites extends ConvertHtml {
return text.substring(l, r)
.replace(/blockquote class="?book"?/g, 'p')
.replace(/<br\/?>\s*<\/h3>/g, '</h3>')
.replace(/<h3 class="?book"?>/g, '<br><br><subtitle>')
.replace(/<h5 class="?book"?>/g, '<br><br><subtitle>')
.replace(/<h3>/g, '<br><br><subtitle>')
.replace(/<h5>/g, '<br><br><subtitle>')
.replace(/<\/h3>/g, '</subtitle><br>')
.replace(/<\/h5>/g, '</subtitle><br>')
.replace(/<h3 class="?book"?>/g, '<br><br><fb2-subtitle>')
.replace(/<h5 class="?book"?>/g, '<br><br><fb2-subtitle>')
.replace(/<h3>/g, '<br><br><fb2-subtitle>')
.replace(/<h5>/g, '<br><br><fb2-subtitle>')
.replace(/<\/h3>/g, '</fb2-subtitle><br>')
.replace(/<\/h5>/g, '</fb2-subtitle><br>')
.replace(/<div class="?stanza"?>/g, '<br>')
.replace(/<div>/g, '<br>')
+ title;

View File

@@ -6,7 +6,8 @@ function parseSync(xstr, options) {
onCdata: _onCdata = dummy,
onComment: _onComment = dummy,
onProgress: _onProgress = dummy,
innerCut = new Set()
innerCut = new Set(),
lowerCase = true,
} = options;
let i = 0;
@@ -91,7 +92,8 @@ function parseSync(xstr, options) {
} else {
tag = tagData;
}
tag = tag.toLowerCase();
if (lowerCase)
tag = tag.toLowerCase();
if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) {
if (!cutCounter)
@@ -146,7 +148,8 @@ async function parse(xstr, options) {
onCdata: _onCdata = dummy,
onComment: _onComment = dummy,
onProgress: _onProgress = dummy,
innerCut = new Set()
innerCut = new Set(),
lowerCase = true,
} = options;
let i = 0;
@@ -231,7 +234,8 @@ async function parse(xstr, options) {
} else {
tag = tagData;
}
tag = tag.toLowerCase();
if (lowerCase)
tag = tag.toLowerCase();
if (innerCut.has(tag) && (!cutCounter || cutTag === tag)) {
if (!cutCounter)
@@ -276,7 +280,7 @@ async function parse(xstr, options) {
await _onProgress(100);
}
function getAttrsSync(tail) {
function getAttrsSync(tail, lowerCase = true) {
let result = {};
let name = '';
let value = '';
@@ -287,13 +291,16 @@ function getAttrsSync(tail) {
let waitEq = false;
const pushResult = () => {
if (lowerCase)
name = name.toLowerCase();
if (name != '') {
const fn = name;
let ns = '';
if (name.indexOf(':') >= 0) {
[ns, name] = name.split(':');
if (fn.indexOf(':') >= 0) {
[ns, name] = fn.split(':');
}
result[name] = {value, ns};
result[name] = {value, ns, fn};
}
name = '';
value = '';

143
server/core/xmlParser.js Normal file
View File

@@ -0,0 +1,143 @@
const sax = require('./sax');
/**
 * Builds a complete XML document string: an XML declaration followed by
 * the serialized node tree.
 *
 * @param {*} xmlParsed - node tree handed to formatXmlNode unchanged.
 * @param {string} [encoding='utf-8'] - value written into the declaration's
 *   `encoding` attribute (the string itself is not re-encoded here).
 * @param {Function} [textFilterFunc] - optional text filter, forwarded to
 *   formatXmlNode.
 * @returns {string} the declaration concatenated with the serialized tree.
 */
function formatXml(xmlParsed, encoding = 'utf-8', textFilterFunc) {
    const declaration = `<?xml version="1.0" encoding="${encoding}"?>`;
    const serializedTree = formatXmlNode(xmlParsed, textFilterFunc);

    return declaration + serializedTree;
}
/**
 * Serializes a node tree into an XML fragment (no XML declaration).
 *
 * Node conventions handled by this function:
 *  - Array: every item is serialized in order, without a wrapping tag.
 *  - string / number / boolean: emitted as text, wrapped in `<name>` when
 *    the value was reached through a named property.
 *  - object: `_n` overrides the tag name, `_attrs` is a map of attribute
 *    name to value, `_t` is the element text, `_a` is a list of child
 *    nodes; every other property whose name does not start with `_` is a
 *    child element named after the property.
 *  - null / undefined: produce no output.
 *
 * Attribute values are written verbatim (no escaping is applied here).
 *
 * @param {*} node - root of the tree to serialize.
 * @param {Function} [textFilterFunc] - applied to every text value before
 *   it is written; defaults to the identity function.
 * @returns {string} the serialized XML fragment.
 */
function formatXmlNode(node, textFilterFunc) {
    const filterText = (textFilterFunc ? textFilterFunc : text => text);
    // Safe own-property check: works for objects created without a prototype too.
    const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

    const wrapText = (text, name) => {
        const filtered = filterText(text);
        return (name ? `<${name}>${filtered}</${name}>` : `${filtered}`);
    };

    const formatNode = (node, name) => {
        // Missing children are skipped instead of crashing on property access.
        if (node === null || node === undefined)
            return '';

        if (Array.isArray(node)) {
            let out = '';
            for (const n of node) {
                out += formatNode(n);
            }
            return out;
        }

        if (typeof node == 'string')
            return wrapText(node, name);

        // Scalars used to be silently dropped; emit them as text instead.
        if (typeof node == 'number' || typeof node == 'boolean')
            return wrapText(String(node), name);

        if (node._n)
            name = node._n;

        let attrs = '';
        if (node._attrs) {
            for (let attrName in node._attrs) {
                attrs += ` ${attrName}="${node._attrs[attrName]}"`;
            }
        }

        let tOpen = '';
        let tBody = '';
        let tClose = '';
        if (name)
            tOpen += `<${name}${attrs}>`;

        // An own `_t` that is null/undefined must not become the text "undefined".
        if (hasOwn(node, '_t') && node._t !== null && node._t !== undefined)
            tBody += filterText(node._t);

        for (let nodeName in node) {
            // Underscore-prefixed keys are metadata, except `_a` (child list).
            if (nodeName && nodeName[0] == '_' && nodeName != '_a')
                continue;

            tBody += formatNode(node[nodeName], nodeName);
        }

        if (name)
            tClose += `</${name}>`;

        return `${tOpen}${tBody}${tClose}`;
    };

    return formatNode(node);
}
/**
 * Parses an XML string into a tree of plain objects using the sax module.
 *
 * Each element becomes a node with:
 *  - `_n`: tag name,
 *  - `_p`: reference to the parent node (note: this makes the tree cyclic),
 *  - `_attrs`: map of full attribute name to value (only when attributes exist),
 *  - `_a`: array of child nodes (only when children exist),
 *  - `_t`: concatenation of all text chunks reported directly inside the element.
 *
 * The `?xml` declaration is ignored.
 *
 * @param {string} xmlString - XML source text.
 * @param {boolean} [lowerCase=true] - forwarded to the sax parser and to
 *   attribute parsing.
 * @returns {Object} the first top-level node, or an empty object when no
 *   element was parsed.
 */
function parseXml(xmlString, lowerCase = true) {
    let result = {};
    let node = result;

    const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
        // Accumulate instead of overwrite: in mixed content the text that follows
        // a child element must not erase the text that preceded it.
        node._t = (typeof node._t == 'string' ? node._t + text : text);
    };

    const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
        if (tag == '?xml')
            return;

        const newNode = {_n: tag, _p: node};

        if (tail) {
            const parsedAttrs = sax.getAttrsSync(tail, lowerCase);
            const atKeys = Object.keys(parsedAttrs);
            if (atKeys.length) {
                const attrs = {};
                for (let i = 0; i < atKeys.length; i++) {
                    const attrName = atKeys[i];
                    // `fn` is the full attribute name (including any namespace prefix).
                    attrs[parsedAttrs[attrName].fn] = parsedAttrs[attrName].value;
                }
                newNode._attrs = attrs;
            }
        }

        if (!node._a)
            node._a = [];
        node._a.push(newNode);
        // Descend: subsequent text and children belong to the new element.
        // NOTE(review): relies on the sax parser also reporting an end event for
        // self-closing tags; otherwise the cursor stays inside them - confirm.
        node = newNode;
    };

    const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
        // Only climb back up when the closing tag matches the current element.
        if (node._p && node._n == tag)
            node = node._p;
    };

    sax.parseSync(xmlString, {
        onStartNode, onEndNode, onTextNode, lowerCase
    });

    // Unwrap the synthetic root container.
    if (result._a)
        result = result._a[0];

    return result;
}
/**
 * Converts a node tree (nodes shaped as {_n, _a, _t, _attrs}) into nested
 * plain objects keyed by tag name, so values can be reached as
 * `result.metadata.title._t`.
 *
 * Only the FIRST element with a given name among siblings is kept; later
 * siblings with the same name are ignored. Nodes without `_n` are skipped.
 * For each kept element, a truthy `_t` and `_attrs` are copied onto the
 * simplified object next to its child keys.
 *
 * @param {Object} node - root node of the tree.
 * @returns {Object} object with a single key (the root tag name) holding the
 *   simplified tree, or an empty object when the root has no `_n`.
 */
function simplifyXmlParsed(node) {
    // Own-property check, so tag names such as "toString" or "constructor"
    // are not mistaken for already-present entries via the prototype chain.
    const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

    const simplifyNodeArray = (a) => {
        const result = {};

        for (let i = 0; i < a.length; i++) {
            const child = a[i];
            if (!child || !child._n)
                continue;
            // Assigning "__proto__" would replace the prototype instead of
            // creating a key, so such elements stay ignored.
            if (child._n === '__proto__' || hasOwn(result, child._n))
                continue;

            result[child._n] = (child._a ? simplifyNodeArray(child._a) : {});

            if (child._t) {
                result[child._n]._t = child._t;
            }
            if (child._attrs) {
                result[child._n]._attrs = child._attrs;
            }
        }

        return result;
    };

    return simplifyNodeArray([node]);
}
// Public API of the xmlParser module: serialization (formatXml, formatXmlNode)
// and parsing (parseXml, simplifyXmlParsed) helpers.
module.exports = {
formatXml,
formatXmlNode,
parseXml,
simplifyXmlParsed
}