Улучшение парсинга pdf и html
This commit is contained in:
@@ -76,7 +76,7 @@ class ConvertHtml extends ConvertBase {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
|
const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'li', 'dt', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
|
||||||
|
|
||||||
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
||||||
if (!cutCounter && !(cutTitle && inTitle)) {
|
if (!cutCounter && !(cutTitle && inTitle)) {
|
||||||
|
|||||||
@@ -38,6 +38,8 @@ class ConvertPdf extends ConvertHtml {
|
|||||||
let images = [];
|
let images = [];
|
||||||
let loading = [];
|
let loading = [];
|
||||||
let inText = false;
|
let inText = false;
|
||||||
|
let bold = false;
|
||||||
|
let italic = false;
|
||||||
let title = '';
|
let title = '';
|
||||||
let prevTop = 0;
|
let prevTop = 0;
|
||||||
let i = -1;
|
let i = -1;
|
||||||
@@ -68,7 +70,12 @@ class ConvertPdf extends ConvertHtml {
|
|||||||
|
|
||||||
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
||||||
if (!cutCounter && inText) {
|
if (!cutCounter && inText) {
|
||||||
lines[i].text += text + ' ';
|
let tOpen = (bold ? '<b>' : '');
|
||||||
|
tOpen += (italic ? '<i>' : '');
|
||||||
|
let tClose = (italic ? '</i>' : '');
|
||||||
|
tClose += (bold ? '</b>' : '');
|
||||||
|
|
||||||
|
lines[i].text += `${tOpen}${text}${tClose} `;
|
||||||
if (i < 2)
|
if (i < 2)
|
||||||
title += text + ' ';
|
title += text + ' ';
|
||||||
}
|
}
|
||||||
@@ -76,6 +83,17 @@ class ConvertPdf extends ConvertHtml {
|
|||||||
|
|
||||||
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
const onStartNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
||||||
if (!cutCounter) {
|
if (!cutCounter) {
|
||||||
|
if (inText) {
|
||||||
|
switch (tag) {
|
||||||
|
case 'i':
|
||||||
|
italic = true;
|
||||||
|
break;
|
||||||
|
case 'b':
|
||||||
|
bold = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (tag == 'text' && !inText) {
|
if (tag == 'text' && !inText) {
|
||||||
let attrs = sax.getAttrsSync(tail);
|
let attrs = sax.getAttrsSync(tail);
|
||||||
const line = {
|
const line = {
|
||||||
@@ -121,6 +139,17 @@ class ConvertPdf extends ConvertHtml {
|
|||||||
};
|
};
|
||||||
|
|
||||||
const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
const onEndNode = (tag, tail, singleTag, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
|
||||||
|
if (inText) {
|
||||||
|
switch (tag) {
|
||||||
|
case 'i':
|
||||||
|
italic = false;
|
||||||
|
break;
|
||||||
|
case 'b':
|
||||||
|
bold = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (tag == 'text')
|
if (tag == 'text')
|
||||||
inText = false;
|
inText = false;
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user