Улучшение парсинга html

This commit is contained in:
Book Pauk
2019-03-16 16:40:31 +07:00
parent e800dfe796
commit 983d9ee1b9
3 changed files with 9 additions and 0 deletions

View File

@@ -1,6 +1,7 @@
const fs = require('fs-extra');
const iconv = require('iconv-lite');
const chardet = require('chardet');
const he = require('he');
const textUtils = require('./textUtils');
const utils = require('../utils');
@@ -80,6 +81,10 @@ class ConvertBase {
return text.replace(/ |[\t\n\r]/g, ' ');
}
escapeEntities(text) {
return he.escape(text);
}
formatFb2(fb2) {
let out = '<?xml version="1.0" encoding="utf-8"?>';
out += '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">';

View File

@@ -79,6 +79,8 @@ class ConvertHtml extends ConvertBase {
const newPara = new Set(['tr', '/table', 'hr', 'br', 'br/', 'li', 'dt', 'dd', 'p', 'title', '/title', 'h1', 'h2', 'h3', '/h1', '/h2', '/h3']);
const onTextNode = (text, cutCounter, cutTag) => {// eslint-disable-line no-unused-vars
text = this.escapeEntities(text);
if (!cutCounter && !(cutTitle && inTitle)) {
let tOpen = (bold ? '<strong>' : '');
tOpen += (italic ? '<emphasis>' : '');

View File

@@ -218,6 +218,8 @@ class ConvertSamlib extends ConvertBase {
if (!text)
return;
text = this.escapeEntities(text);
switch (path) {
case '/html/body/center/h2':
titleInfo['book-title'] = text;