Работа над конвертером pdf

This commit is contained in:
Book Pauk
2020-12-09 03:06:15 +07:00
parent 15f02c7115
commit 522f953b4f
2 changed files with 52 additions and 13 deletions

View File

@@ -257,20 +257,18 @@ class ConvertHtml extends ConvertBase {
newPar(); newPar();
let j = 0;
const lines = par._t.split('\n'); const lines = par._t.split('\n');
for (let line of lines) { for (let j = 0; j < lines.length; j++) {
line = repCrLfTab(line); const line = repCrLfTab(lines[j]);
let l = 0; let l = 0;
while (l < line.length && line[l] == ' ') { while (l < line.length && line[l] == ' ') {
l++; l++;
} }
if (l >= parIndent || line == '') { if ((j > 0 && l >= parIndent) ||
if (j > 0) (j < lines.length - 1 && line == '') ){
newPar(); newPar();
j++;
} }
curPar._t += line.trim() + ' '; curPar._t += line.trim() + ' ';

View File

@@ -1,4 +1,4 @@
const _ = require('lodash'); //const _ = require('lodash');
const fs = require('fs-extra'); const fs = require('fs-extra');
const path = require('path'); const path = require('path');
@@ -44,10 +44,13 @@ class ConvertPdf extends ConvertHtml {
const data = await fs.readFile(outFile); const data = await fs.readFile(outFile);
callback(90); callback(90);
await utils.sleep(100);
//парсим xml //парсим xml
let lines = []; let lines = [];
let pagelines = []; let pagelines = [];
let line = {text: ''}; let line = {text: ''};
let fonts = {};
let images = []; let images = [];
let loading = []; let loading = [];
@@ -108,6 +111,26 @@ class ConvertPdf extends ConvertHtml {
}; };
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (tag == 'textstyle') {
const attrs = sax.getAttrsSync(tail);
const fontId = (attrs.id && attrs.id.value ? attrs.id.value : '');
const fontStyle = (attrs.fontstyle && attrs.fontstyle.value ? attrs.fontstyle.value : '');
if (fontId && fontStyle) {
const styles = fontStyle.split(' ');
const styleTags = {bold: 'b', italics: 'i', superscript: 'sup', subscript: 'sub'};
const f = fonts[fontId] = {tOpen: '', tClose: ''};
styles.forEach(style => {
const s = styleTags[style];
if (s) {
f.tOpen += `<${s}>`;
f.tClose = `</${s}>${f.tClose}`;
}
});
}
}
if (tag == 'page') { if (tag == 'page') {
putPageLines(); putPageLines();
putImage(100000); putImage(100000);
@@ -125,6 +148,7 @@ class ConvertPdf extends ConvertHtml {
if (line.width != 0 || line.height != 0) { if (line.width != 0 || line.height != 0) {
if (Math.abs(prevTop - line.top) > 3) { if (Math.abs(prevTop - line.top) > 3) {
putImage(line.top);
pagelines.push(line); pagelines.push(line);
} }
prevTop = line.top; prevTop = line.top;
@@ -134,7 +158,16 @@ class ConvertPdf extends ConvertHtml {
if (tag == 'string') { if (tag == 'string') {
const attrs = sax.getAttrsSync(tail); const attrs = sax.getAttrsSync(tail);
if (attrs.content && attrs.content.value) { if (attrs.content && attrs.content.value) {
line.text += `${attrs.content.value} `;
let tOpen = '';
let tClose = '';
const fontId = (attrs.stylerefs && attrs.stylerefs.value ? attrs.stylerefs.value : '');
if (fontId && fonts[fontId]) {
tOpen = fonts[fontId].tOpen;
tClose = fonts[fontId].tClose;
}
line.text += `${tOpen}${attrs.content.value}${tClose} `;
} }
} }
@@ -149,10 +182,16 @@ class ConvertPdf extends ConvertHtml {
data: '', data: '',
type: '', type: '',
top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10) || 0, top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10) || 0,
left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10) || 0,
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
}; };
const exists = images.filter(img => (img.top == image.top && img.left == image.left && img.width == image.width && img.height == image.height));
if (!exists.length) {
loading.push(loadImage(image)); loading.push(loadImage(image));
images.push(image); images.push(image);
images.sort((a, b) => a.top - b.top) images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
}
} }
} }
} }
@@ -167,6 +206,7 @@ class ConvertPdf extends ConvertHtml {
putImage(100000); putImage(100000);
await Promise.all(loading); await Promise.all(loading);
await utils.sleep(100);
//найдем параграфы и отступы //найдем параграфы и отступы
const indents = []; const indents = [];
@@ -236,6 +276,7 @@ class ConvertPdf extends ConvertHtml {
if (concat) if (concat)
text += sp + concat + "\n"; text += sp + concat + "\n";
await utils.sleep(100);
return await super.run(Buffer.from(text), {skipCheck: true, isText: true}); return await super.run(Buffer.from(text), {skipCheck: true, isText: true});
} }
} }