Работа над конвертером pdf

This commit is contained in:
Book Pauk
2020-12-14 02:22:38 +07:00
parent 17699f66f8
commit 9906dd43c7

View File

@@ -55,6 +55,10 @@ class ConvertPdf extends ConvertHtml {
let images = [];
let loading = [];
let inText = false;
let bold = false;
let italic = false;
let i = -1;
const loadImage = async(image) => {
@@ -81,22 +85,30 @@ class ConvertPdf extends ConvertHtml {
}
};
const isTextBold = (text) => {
const m = text.trim().match(/^<b>(.*)<\/b>$/);
return m && !m[1].match(/<b>|<\/b>|<i>|<\/i>/g);
};
const isTextEmpty = (text) => {
return text.replace(/<b>|<\/b>|<i>|<\/i>/g, '').trim() == '';
};
const putPageLines = () => {
pagelines.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left))
pagelines.sort((a, b) => (Math.abs(a.top - b.top) > 3 ? a.top - b.top : 0)*10000 + (a.left - b.left))
//объединяем в одну строку равные по высоте
const pl = [];
let pt = 0;
let j = -1;
pagelines.forEach(line => {
//добавим закрывающий тег стиля
line.text += line.tClose;
if (isTextEmpty(line.text))
return;
//проверим, возможно это заголовок
if (line.fonts.length == 1 && line.pageWidth) {
const f = (line.fonts.length ? fonts[line.fonts[0]] : null);
if (line.fontId && line.pageWidth) {
const centerLeft = (line.pageWidth - line.width)/2;
if (f && f.isBold && Math.abs(centerLeft - line.left) < 3) {
if (isTextBold(line.text) && Math.abs(centerLeft - line.left) < 10) {
if (!sectionTitleFound) {
line.isSectionTitle = true;
sectionTitleFound = true;
@@ -124,8 +136,8 @@ class ConvertPdf extends ConvertHtml {
//добавим пустую строку, если надо
const prevLine = (i > lastIndex ? lines[i] : {fonts: [], top: 0});
if (prevLine && !prevLine.isImage) {
const f = (prevLine.fonts.length ? fonts[prevLine.fonts[0]] : (line.fonts.length ? fonts[line.fonts[0]] : null));
if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize*1.8) {
const f = (prevLine.fontId ? fonts[prevLine.fontId] : (line.fontId ? fonts[line.fontId] : null));
if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize * 1.8) {
i++;
lines[i] = {text: '<br>'};
}
@@ -138,29 +150,26 @@ class ConvertPdf extends ConvertHtml {
putImage(100000);
};
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (!cutCounter && inText) {
let tOpen = (bold ? '<b>' : '');
tOpen += (italic ? '<i>' : '');
let tClose = (italic ? '</i>' : '');
tClose += (bold ? '</b>' : '');
line.text += ` ${tOpen}${text}${tClose}`;
}
};
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (tag == 'textstyle') {
const attrs = sax.getAttrsSync(tail);
const fontId = (attrs.id && attrs.id.value ? attrs.id.value : '');
const fontStyle = (attrs.fontstyle && attrs.fontstyle.value ? attrs.fontstyle.value : '');
const fontSize = (attrs.fontsize && attrs.fontsize.value ? attrs.fontsize.value : '');
if (fontId) {
const styleTags = {bold: 'b', italics: 'i', superscript: 'sup', subscript: 'sub'};
const f = fonts[fontId] = {tOpen: '', tClose: '', isBold: false, fontSize};
if (fontStyle) {
const styles = fontStyle.split(' ');
styles.forEach(style => {
const s = styleTags[style];
if (s) {
f.tOpen += `<${s}>`;
f.tClose = `</${s}>${f.tClose}`;
if (s == 'b')
f.isBold = true;
}
});
}
if (inText) {
switch (tag) {
case 'i':
italic = true;
break;
case 'b':
bold = true;
break;
}
}
@@ -173,80 +182,78 @@ class ConvertPdf extends ConvertHtml {
putPageLines();
}
if (tag == 'textline') {
if (tag == 'fontspec') {
const attrs = sax.getAttrsSync(tail);
const fontId = (attrs.id && attrs.id.value ? attrs.id.value : '');
const fontSize = (attrs.size && attrs.size.value ? attrs.size.value : '');
if (fontId) {
fonts[fontId] = {fontSize};
}
}
if (tag == 'text' && !inText) {
const attrs = sax.getAttrsSync(tail);
line = {
text: '',
top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10),
left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10),
top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10),
left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10),
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
tOpen: '',
tClose: '',
isSectionTitle: false,
isSubtitle: false,
pageWidth: page.width,
fonts: [],
fontId: (attrs.font && attrs.font.value ? attrs.font.value : ''),
};
if (line.width != 0 || line.height != 0) {
inText = true;
pagelines.push(line);
}
}
if (tag == 'string') {
if (tag == 'image') {
const attrs = sax.getAttrsSync(tail);
if (attrs.content && attrs.content.value) {
let src = (attrs.src && attrs.src.value ? attrs.src.value : '');
if (src) {
const image = {
isImage: true,
src,
data: '',
type: '',
top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10) || 0,
left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10) || 0,
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
};
let tOpen = '';
let tClose = '';
const fontId = (attrs.stylerefs && attrs.stylerefs.value ? attrs.stylerefs.value : '');
if (fontId && fonts[fontId]) {
tOpen = fonts[fontId].tOpen;
tClose = fonts[fontId].tClose;
if (!line.fonts.length || line.fonts[0] != fontId)
line.fonts.push(fontId);
}
if (line.tOpen != tOpen) {
line.text += line.tClose + tOpen;
line.tOpen = tOpen;
line.tClose = tClose;
}
line.text += `${line.text.length ? ' ' : ''}${attrs.content.value}`;
}
}
if (tag == 'illustration') {
const attrs = sax.getAttrsSync(tail);
if (attrs.type && attrs.type.value == 'image') {
let src = (attrs.fileid && attrs.fileid.value ? attrs.fileid.value : '');
if (src) {
const image = {
isImage: true,
src,
data: '',
type: '',
top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10) || 0,
left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10) || 0,
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
};
const exists = images.filter(img => (img.top == image.top && img.left == image.left && img.width == image.width && img.height == image.height));
if (!exists.length) {
loading.push(loadImage(image));
images.push(image);
images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
}
}
loading.push(loadImage(image));
images.push(image);
images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
}
}
};
const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (inText) {
switch (tag) {
case 'i':
italic = false;
break;
case 'b':
bold = false;
break;
}
}
if (tag == 'text')
inText = false;
};
let buf = this.decode(data).toString();
sax.parseSync(buf, {
onStartNode
onStartNode, onEndNode, onTextNode
});
putPageLines();
@@ -290,6 +297,7 @@ class ConvertPdf extends ConvertHtml {
let concat = '';
let sp = '';
let firstLine = true;
for (const line of lines) {
if (text.length > limitSize) {
throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`);
@@ -301,10 +309,15 @@ class ConvertPdf extends ConvertHtml {
}
if (line.isSectionTitle) {
text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`;
if (firstLine)
text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`;
else
text += `<fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
continue;
}
firstLine = false;
if (line.isSubtitle) {
text += `<br><fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
continue;