Улучшение парсинга Pdf
This commit is contained in:
@@ -34,10 +34,13 @@ class ConvertHtml extends ConvertBase {
|
|||||||
let desc = {_n: 'description', 'title-info': titleInfo};
|
let desc = {_n: 'description', 'title-info': titleInfo};
|
||||||
let pars = [];
|
let pars = [];
|
||||||
let body = {_n: 'body', section: {_a: []}};
|
let body = {_n: 'body', section: {_a: []}};
|
||||||
let fb2 = [desc, body];
|
let binary = [];
|
||||||
|
let fb2 = [desc, body, binary];
|
||||||
|
|
||||||
let title = '';
|
let title = '';
|
||||||
let inTitle = false;
|
let inTitle = false;
|
||||||
|
let inImage = false;
|
||||||
|
let image = {};
|
||||||
|
|
||||||
let spaceCounter = [];
|
let spaceCounter = [];
|
||||||
|
|
||||||
@@ -80,6 +83,15 @@ class ConvertHtml extends ConvertBase {
|
|||||||
|
|
||||||
if (inTitle && !title)
|
if (inTitle && !title)
|
||||||
title = text;
|
title = text;
|
||||||
|
|
||||||
|
if (inImage) {
|
||||||
|
image._t = text;
|
||||||
|
binary.push(image);
|
||||||
|
|
||||||
|
pars.push({_n: 'image', _attrs: {'l:href': '#' + image._attrs.id}, _t: ''});
|
||||||
|
newParagraph();
|
||||||
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
||||||
@@ -90,18 +102,27 @@ class ConvertHtml extends ConvertBase {
|
|||||||
|
|
||||||
if (tag == 'title')
|
if (tag == 'title')
|
||||||
inTitle = true;
|
inTitle = true;
|
||||||
|
|
||||||
|
if (tag == 'fb2-image') {
|
||||||
|
inImage = true;
|
||||||
|
const attrs = sax.getAttrsSync(tail);
|
||||||
|
image = {_n: 'binary', _attrs: {id: attrs.name.value, 'content-type': attrs.type.value}, _t: ''};
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
||||||
if (tag == 'title')
|
if (tag == 'title')
|
||||||
inTitle = false;
|
inTitle = false;
|
||||||
|
|
||||||
|
if (tag == 'fb2-image')
|
||||||
|
inImage = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
let buf = this.decode(data).toString();
|
let buf = this.decode(data).toString();
|
||||||
|
|
||||||
sax.parseSync(buf, {
|
sax.parseSync(buf, {
|
||||||
onStartNode, onEndNode, onTextNode,
|
onStartNode, onEndNode, onTextNode,
|
||||||
innerCut: new Set(['head', 'script', 'style', 'binary'])
|
innerCut: new Set(['head', 'script', 'style', 'binary', 'fb2-image'])
|
||||||
});
|
});
|
||||||
|
|
||||||
titleInfo['book-title'] = title;
|
titleInfo['book-title'] = title;
|
||||||
@@ -148,6 +169,11 @@ class ConvertHtml extends ConvertBase {
|
|||||||
|
|
||||||
i = 0;
|
i = 0;
|
||||||
for (const par of pars) {
|
for (const par of pars) {
|
||||||
|
if (par._n != 'p') {
|
||||||
|
newPars.push(par);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (i > 0)
|
if (i > 0)
|
||||||
newPar();
|
newPar();
|
||||||
i++;
|
i++;
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ class ConvertPdf extends ConvertHtml {
|
|||||||
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
|
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
|
||||||
};
|
};
|
||||||
|
|
||||||
if (line.width !== '0' || line.height !== '0') {
|
if (line.width != 0 || line.height != 0) {
|
||||||
inText = true;
|
inText = true;
|
||||||
if (isNaN(line.top) || isNaN(prevTop) || (Math.abs(prevTop - line.top) > 3)) {
|
if (isNaN(line.top) || isNaN(prevTop) || (Math.abs(prevTop - line.top) > 3)) {
|
||||||
putImage(line.top);
|
putImage(line.top);
|
||||||
@@ -98,7 +98,7 @@ class ConvertPdf extends ConvertHtml {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (tag == 'image') {
|
if (tag == 'image') {
|
||||||
let attrs = sax.getAttrsSync(tail);
|
const attrs = sax.getAttrsSync(tail);
|
||||||
const src = (attrs.src && attrs.src.value ? attrs.src.value : '');
|
const src = (attrs.src && attrs.src.value ? attrs.src.value : '');
|
||||||
if (src) {
|
if (src) {
|
||||||
const image = {
|
const image = {
|
||||||
|
|||||||
Reference in New Issue
Block a user