Улучшение парсинга pdf и html

This commit is contained in:
Book Pauk
2019-03-04 22:56:15 +07:00
parent fcb61c89d5
commit b3e579d8b7
2 changed files with 31 additions and 2 deletions

View File

@@ -76,7 +76,7 @@ class ConvertHtml extends ConvertBase {
}
};
const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'li', 'dt', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (!cutCounter && !(cutTitle && inTitle)) {

View File

@@ -38,6 +38,8 @@ class ConvertPdf extends ConvertHtml {
// Parser state shared by the node callbacks declared further down in this scope.
let images = []; // NOTE(review): usage not visible in this excerpt
let loading = []; // NOTE(review): usage not visible in this excerpt
// Text nodes are only collected while this is set; it is cleared when a <text> tag closes.
let inText = false;
// Set by an opening <b> / cleared by a closing </b> seen inside a <text> element.
let bold = false;
// Set by an opening <i> / cleared by a closing </i> seen inside a <text> element.
let italic = false;
// Accumulates the raw text collected while i < 2.
let title = '';
let prevTop = 0; // NOTE(review): usage not visible in this excerpt
// Index of the current entry in `lines`; presumably advanced when a new <text> starts — TODO confirm.
let i = -1;
@@ -68,7 +70,12 @@ class ConvertPdf extends ConvertHtml {
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (!cutCounter && inText) {
lines[i].text += text + ' ';
let tOpen = (bold ? '<b>' : '');
tOpen += (italic ? '<i>' : '');
let tClose = (italic ? '</i>' : '');
tClose += (bold ? '</b>' : '');
lines[i].text += `${tOpen}${text}${tClose} `;
if (i < 2)
title += text + ' ';
}
@@ -76,6 +83,17 @@ class ConvertPdf extends ConvertHtml {
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
if (!cutCounter) {
if (inText) {
switch (tag) {
case 'i':
italic = true;
break;
case 'b':
bold = true;
break;
}
}
if (tag == 'text' && !inText) {
let attrs = sax.getAttrsSync(tail);
const line = {
@@ -121,6 +139,17 @@ class ConvertPdf extends ConvertHtml {
};
/**
 * Parser callback invoked for every closing tag.
 *
 * While inside a <text> element, a closing </i> or </b> switches off the
 * matching style flag. A closing </text> ends the current text element and
 * also clears both style flags, so that a style tag left unclosed inside one
 * <text> cannot carry over into the text elements that follow.
 *
 * Only `tag` is inspected; the remaining parameters are accepted to match the
 * callback signature and are not used by this handler.
 *
 * @param {string} tag - name of the tag being closed (e.g. 'i', 'b', 'text')
 */
const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
    if (inText) {
        switch (tag) {
            case 'i':
                italic = false;
                break;
            case 'b':
                bold = false;
                break;
        }
    }

    if (tag === 'text') {
        inText = false;
        // Well-formed input has already closed its <b>/<i> by now; resetting
        // here keeps malformed input from styling all subsequent text.
        bold = false;
        italic = false;
    }
};