Улучшение парсинга Pdf

This commit is contained in:
Book Pauk
2019-03-04 21:22:12 +07:00
parent cb65cac333
commit 2f8b68ec62
2 changed files with 30 additions and 4 deletions

View File

@@ -34,10 +34,13 @@ class ConvertHtml extends ConvertBase {
let desc = {_n: 'description', 'title-info': titleInfo};
let pars = [];
let body = {_n: 'body', section: {_a: []}};
let fb2 = [desc, body];
let binary = [];
let fb2 = [desc, body, binary];
let title = '';
let inTitle = false;
let inImage = false;
let image = {};
let spaceCounter = [];
@@ -80,6 +83,15 @@ class ConvertHtml extends ConvertBase {
if (inTitle && !title)
title = text;
if (inImage) {
image._t = text;
binary.push(image);
pars.push({_n: 'image', _attrs: {'l:href': '#' + image._attrs.id}, _t: ''});
newParagraph();
}
};
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
@@ -90,18 +102,27 @@ class ConvertHtml extends ConvertBase {
if (tag == 'title')
inTitle = true;
if (tag == 'fb2-image') {
inImage = true;
const attrs = sax.getAttrsSync(tail);
image = {_n: 'binary', _attrs: {id: attrs.name.value, 'content-type': attrs.type.value}, _t: ''};
}
};
// Closing-tag callback handed to sax.parseSync alongside onStartNode and onTextNode.
// Only `tag` is inspected; the remaining parameters are unused here (hence the
// eslint no-unused-vars suppression on the signature line).
const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
// Leaving <title>: clear the flag that onStartNode set, so later text nodes
// are no longer considered as candidates for the book title.
if (tag == 'title')
inTitle = false;
// Leaving <fb2-image>: clear the flag that onStartNode set, so later text nodes
// are no longer stored as image (binary) content.
if (tag == 'fb2-image')
inImage = false;
};
let buf = this.decode(data).toString();
sax.parseSync(buf, {
onStartNode, onEndNode, onTextNode,
innerCut: new Set(['head', 'script', 'style', 'binary'])
innerCut: new Set(['head', 'script', 'style', 'binary', 'fb2-image'])
});
titleInfo['book-title'] = title;
@@ -148,6 +169,11 @@ class ConvertHtml extends ConvertBase {
i = 0;
for (const par of pars) {
if (par._n != 'p') {
newPars.push(par);
continue;
}
if (i > 0)
newPar();
i++;

View File

@@ -86,7 +86,7 @@ class ConvertPdf extends ConvertHtml {
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
};
if (line.width !== '0' || line.height !== '0') {
if (line.width != 0 || line.height != 0) {
inText = true;
if (isNaN(line.top) || isNaN(prevTop) || (Math.abs(prevTop - line.top) > 3)) {
putImage(line.top);
@@ -98,7 +98,7 @@ class ConvertPdf extends ConvertHtml {
}
if (tag == 'image') {
let attrs = sax.getAttrsSync(tail);
const attrs = sax.getAttrsSync(tail);
const src = (attrs.src && attrs.src.value ? attrs.src.value : '');
if (src) {
const image = {