Работа над конвертером pdf

This commit is contained in:
Book Pauk
2020-12-14 02:22:38 +07:00
parent 17699f66f8
commit 9906dd43c7

View File

@@ -55,6 +55,10 @@ class ConvertPdf extends ConvertHtml {
let images = []; let images = [];
let loading = []; let loading = [];
let inText = false;
let bold = false;
let italic = false;
let i = -1; let i = -1;
const loadImage = async(image) => { const loadImage = async(image) => {
@@ -81,22 +85,30 @@ class ConvertPdf extends ConvertHtml {
} }
}; };
// Tells whether a line's markup is exactly one <b>…</b> pair around the whole
// (trimmed) text, with no further <b>/<i> tags inside it.
// Always returns a strict boolean, so it reads the same as isTextEmpty.
const isTextBold = (text) => {
    const m = text.trim().match(/^<b>(.*)<\/b>$/);
    if (!m)
        return false;

    // any additional style tag inside the pair means the line is mixed-style
    return !/<b>|<\/b>|<i>|<\/i>/.test(m[1]);
};
// Reports whether nothing but whitespace is left in a line once the
// <b>/<i> style tags have been stripped from it.
const isTextEmpty = (text) => {
    const withoutStyleTags = text.replace(/<b>|<\/b>|<i>|<\/i>/g, '');
    return withoutStyleTags.trim().length === 0;
};
const putPageLines = () => { const putPageLines = () => {
pagelines.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left)) pagelines.sort((a, b) => (Math.abs(a.top - b.top) > 3 ? a.top - b.top : 0)*10000 + (a.left - b.left))
//объединяем в одну строку равные по высоте //объединяем в одну строку равные по высоте
const pl = []; const pl = [];
let pt = 0; let pt = 0;
let j = -1; let j = -1;
pagelines.forEach(line => { pagelines.forEach(line => {
//добавим закрывающий тег стиля if (isTextEmpty(line.text))
line.text += line.tClose; return;
//проверим, возможно это заголовок //проверим, возможно это заголовок
if (line.fonts.length == 1 && line.pageWidth) { if (line.fontId && line.pageWidth) {
const f = (line.fonts.length ? fonts[line.fonts[0]] : null);
const centerLeft = (line.pageWidth - line.width)/2; const centerLeft = (line.pageWidth - line.width)/2;
if (f && f.isBold && Math.abs(centerLeft - line.left) < 3) { if (isTextBold(line.text) && Math.abs(centerLeft - line.left) < 10) {
if (!sectionTitleFound) { if (!sectionTitleFound) {
line.isSectionTitle = true; line.isSectionTitle = true;
sectionTitleFound = true; sectionTitleFound = true;
@@ -124,8 +136,8 @@ class ConvertPdf extends ConvertHtml {
//добавим пустую строку, если надо //добавим пустую строку, если надо
const prevLine = (i > lastIndex ? lines[i] : {fonts: [], top: 0}); const prevLine = (i > lastIndex ? lines[i] : {fonts: [], top: 0});
if (prevLine && !prevLine.isImage) { if (prevLine && !prevLine.isImage) {
const f = (prevLine.fonts.length ? fonts[prevLine.fonts[0]] : (line.fonts.length ? fonts[line.fonts[0]] : null)); const f = (prevLine.fontId ? fonts[prevLine.fontId] : (line.fontId ? fonts[line.fontId] : null));
if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize*1.8) { if (f && f.fontSize && !line.isImage && line.top - prevLine.top > f.fontSize * 1.8) {
i++; i++;
lines[i] = {text: '<br>'}; lines[i] = {text: '<br>'};
} }
@@ -138,29 +150,26 @@ class ConvertPdf extends ConvertHtml {
putImage(100000); putImage(100000);
}; };
// Text-node callback: appends the node's text to the current line, wrapped in
// the style tags that are active at this point.
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
    // nothing to collect when cutCounter is set (presumably text inside a tag
    // the parser is cutting out - confirm against the sax helper) or when we
    // are not inside a text element
    if (cutCounter || !inText)
        return;

    // <b> opens first and closes last, so the emitted tags always nest properly
    const opening = `${bold ? '<b>' : ''}${italic ? '<i>' : ''}`;
    const closing = `${italic ? '</i>' : ''}${bold ? '</b>' : ''}`;

    // every chunk is prefixed with a single space
    line.text += ` ${opening}${text}${closing}`;
};
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (tag == 'textstyle') { if (inText) {
const attrs = sax.getAttrsSync(tail); switch (tag) {
const fontId = (attrs.id && attrs.id.value ? attrs.id.value : ''); case 'i':
const fontStyle = (attrs.fontstyle && attrs.fontstyle.value ? attrs.fontstyle.value : ''); italic = true;
const fontSize = (attrs.fontsize && attrs.fontsize.value ? attrs.fontsize.value : ''); break;
case 'b':
if (fontId) { bold = true;
const styleTags = {bold: 'b', italics: 'i', superscript: 'sup', subscript: 'sub'}; break;
const f = fonts[fontId] = {tOpen: '', tClose: '', isBold: false, fontSize};
if (fontStyle) {
const styles = fontStyle.split(' ');
styles.forEach(style => {
const s = styleTags[style];
if (s) {
f.tOpen += `<${s}>`;
f.tClose = `</${s}>${f.tClose}`;
if (s == 'b')
f.isBold = true;
}
});
}
} }
} }
@@ -173,80 +182,78 @@ class ConvertPdf extends ConvertHtml {
putPageLines(); putPageLines();
} }
if (tag == 'textline') { if (tag == 'fontspec') {
const attrs = sax.getAttrsSync(tail);
const fontId = (attrs.id && attrs.id.value ? attrs.id.value : '');
const fontSize = (attrs.size && attrs.size.value ? attrs.size.value : '');
if (fontId) {
fonts[fontId] = {fontSize};
}
}
if (tag == 'text' && !inText) {
const attrs = sax.getAttrsSync(tail); const attrs = sax.getAttrsSync(tail);
line = { line = {
text: '', text: '',
top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10), top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10),
left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10), left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10),
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10), width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10),
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10), height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10),
tOpen: '',
tClose: '',
isSectionTitle: false, isSectionTitle: false,
isSubtitle: false, isSubtitle: false,
pageWidth: page.width, pageWidth: page.width,
fonts: [], fontId: (attrs.font && attrs.font.value ? attrs.font.value : ''),
}; };
if (line.width != 0 || line.height != 0) { if (line.width != 0 || line.height != 0) {
inText = true;
pagelines.push(line); pagelines.push(line);
} }
} }
if (tag == 'string') { if (tag == 'image') {
const attrs = sax.getAttrsSync(tail); const attrs = sax.getAttrsSync(tail);
if (attrs.content && attrs.content.value) { let src = (attrs.src && attrs.src.value ? attrs.src.value : '');
if (src) {
const image = {
isImage: true,
src,
data: '',
type: '',
top: parseInt((attrs.top && attrs.top.value ? attrs.top.value : null), 10) || 0,
left: parseInt((attrs.left && attrs.left.value ? attrs.left.value : null), 10) || 0,
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
};
let tOpen = ''; loading.push(loadImage(image));
let tClose = ''; images.push(image);
const fontId = (attrs.stylerefs && attrs.stylerefs.value ? attrs.stylerefs.value : ''); images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
if (fontId && fonts[fontId]) {
tOpen = fonts[fontId].tOpen;
tClose = fonts[fontId].tClose;
if (!line.fonts.length || line.fonts[0] != fontId)
line.fonts.push(fontId);
}
if (line.tOpen != tOpen) {
line.text += line.tClose + tOpen;
line.tOpen = tOpen;
line.tClose = tClose;
}
line.text += `${line.text.length ? ' ' : ''}${attrs.content.value}`;
}
}
if (tag == 'illustration') {
const attrs = sax.getAttrsSync(tail);
if (attrs.type && attrs.type.value == 'image') {
let src = (attrs.fileid && attrs.fileid.value ? attrs.fileid.value : '');
if (src) {
const image = {
isImage: true,
src,
data: '',
type: '',
top: parseInt((attrs.vpos && attrs.vpos.value ? attrs.vpos.value : null), 10) || 0,
left: parseInt((attrs.hpos && attrs.hpos.value ? attrs.hpos.value : null), 10) || 0,
width: parseInt((attrs.width && attrs.width.value ? attrs.width.value : null), 10) || 0,
height: parseInt((attrs.height && attrs.height.value ? attrs.height.value : null), 10) || 0,
};
const exists = images.filter(img => (img.top == image.top && img.left == image.left && img.width == image.width && img.height == image.height));
if (!exists.length) {
loading.push(loadImage(image));
images.push(image);
images.sort((a, b) => (a.top - b.top)*10000 + (a.left - b.left));
}
}
} }
} }
}; };
// End-tag callback: keeps the inText/bold/italic state in step with closing tags.
const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
    // style tags are only tracked while inside a text element
    if (inText) {
        switch (tag) {
            case 'i':
                italic = false;
                break;
            case 'b':
                bold = false;
                break;
        }
    }

    if (tag === 'text') {
        inText = false;
        // A style cannot outlive its text element. Because closing style tags
        // are ignored once inText is false, a missing or late </b> / </i>
        // would otherwise leave the flag set and style every following line.
        bold = false;
        italic = false;
    }
};
let buf = this.decode(data).toString(); let buf = this.decode(data).toString();
sax.parseSync(buf, { sax.parseSync(buf, {
onStartNode onStartNode, onEndNode, onTextNode
}); });
putPageLines(); putPageLines();
@@ -290,6 +297,7 @@ class ConvertPdf extends ConvertHtml {
let concat = ''; let concat = '';
let sp = ''; let sp = '';
let firstLine = true;
for (const line of lines) { for (const line of lines) {
if (text.length > limitSize) { if (text.length > limitSize) {
throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`); throw new Error(`Файл для конвертирования слишком большой|FORLOG| text.length: ${text.length} > ${limitSize}`);
@@ -301,10 +309,15 @@ class ConvertPdf extends ConvertHtml {
} }
if (line.isSectionTitle) { if (line.isSectionTitle) {
text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`; if (firstLine)
text += `<fb2-section-title>${line.text.trim()}</fb2-section-title>`;
else
text += `<fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
continue; continue;
} }
firstLine = false;
if (line.isSubtitle) { if (line.isSubtitle) {
text += `<br><fb2-subtitle>${line.text.trim()}</fb2-subtitle>`; text += `<br><fb2-subtitle>${line.text.trim()}</fb2-subtitle>`;
continue; continue;