From 9c20df510d433acf1d205c7906420a3203e7ec68 Mon Sep 17 00:00:00 2001 From: Book Pauk Date: Tue, 12 Feb 2019 22:52:12 +0700 Subject: [PATCH] =?UTF-8?q?=D0=A3=D0=BB=D1=83=D1=87=D1=88=D0=B8=D0=BB=20?= =?UTF-8?q?=D0=BF=D0=BE=D0=B4=D0=B4=D0=B5=D1=80=D0=B6=D0=BA=D1=83=20=D1=82?= =?UTF-8?q?=D0=B5=D0=BA=D1=81=D1=82=D0=BE=D0=B2=D1=8B=D1=85=20=D1=84=D0=B0?= =?UTF-8?q?=D0=B9=D0=BB=D0=BE=D0=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/core/BookConverter/getEncoding.js | 68 -------------------- server/core/BookConverter/index.js | 24 ++++--- server/core/BookConverter/textUtils.js | 82 ++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 76 deletions(-) delete mode 100644 server/core/BookConverter/getEncoding.js create mode 100644 server/core/BookConverter/textUtils.js diff --git a/server/core/BookConverter/getEncoding.js b/server/core/BookConverter/getEncoding.js deleted file mode 100644 index d5533bc8..00000000 --- a/server/core/BookConverter/getEncoding.js +++ /dev/null @@ -1,68 +0,0 @@ -function getEncoding(buf) { - const lowerCase = 3; - const upperCase = 1; - - const codePage = { - 'k': 'koi8-r', - 'w': 'Windows-1251', - 'd': 'cp866', - 'i': 'ISO-8859-5', - 'm': 'maccyrillic', - }; - - let charsets = { - 'k': 0, - 'w': 0, - 'd': 0, - 'i': 0, - 'm': 0 - }; - - const len = buf.length; - const blockSize = (len > 5*3000 ? 3000 : len); - let counter = 0; - let i = 0; - while (i < len) { - const char = buf[i]; - i++; - //non-russian characters - if (char < 128 || char > 256) - continue; - //CP866 - if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase; - if ((char > 127 && char < 160)) charsets['d'] += upperCase; - - //KOI8-R - if ((char > 191 && char < 223)) charsets['k'] += lowerCase; - if ((char > 222 && char < 256)) charsets['k'] += upperCase; - - //WIN-1251 - if (char > 223 && char < 256) charsets['w'] += lowerCase; - if (char > 191 && char < 224) charsets['w'] += upperCase; - - //MAC - if (char > 221 && char < 255) charsets['m'] += lowerCase; - if (char > 127 && char < 160) charsets['m'] += upperCase; - - //ISO-8859-5 - if (char > 207 && char < 240) charsets['i'] += lowerCase; - if (char > 175 && char < 208) charsets['i'] += upperCase; - - counter++; - - if (counter > blockSize) { - counter = 0; - i += Math.round(len/2 - 2*blockSize); - } - } - - let sorted = Object.keys(charsets).map(function(key) { - return { codePage: codePage[key], c: charsets[key] }; - }); - - sorted.sort((a, b) => b.c - a.c); - - return sorted[0].codePage; - } - -module.exports = getEncoding; \ No newline at end of file diff --git a/server/core/BookConverter/index.js b/server/core/BookConverter/index.js index ca710bad..1a6adaf9 100644 --- a/server/core/BookConverter/index.js +++ b/server/core/BookConverter/index.js @@ -4,7 +4,7 @@ const iconv = require('iconv-lite'); const chardet = require('chardet'); const _ = require('lodash'); const sax = require('./sax'); -const getEncoding = require('./getEncoding'); +const textUtils = require('./textUtils'); const FileDetector = require('../FileDetector'); @@ -18,9 +18,10 @@ class BookConverter { async convertToFb2(inputFile, outputFile, url, callback) { const fileType = await this.detector.detectFile(inputFile); - if (fileType && (fileType.ext == 'html' || fileType.ext == 'xml')) { - const data = await fs.readFile(inputFile); + const data = await fs.readFile(inputFile); + callback(100); + if (fileType && (fileType.ext == 'html' || fileType.ext == 'xml')) { if (data.toString().indexOf('= 0) { await fs.writeFile(outputFile, data); return; @@ -34,12 +35,19 @@ class BookConverter { } await fs.writeFile(outputFile, this.convertHtml(data)); - callback(100); + return; } else { if (fileType) - throw new Error(`unknown file format: ${fileType.mime}`); - else - throw new Error(`unsupported file format: ${url}`); + throw new Error(`Этот формат файла не поддерживается: ${fileType.mime}`); + else { + //может это чистый текст? + if (textUtils.checkIfText(data)) { + await fs.writeFile(outputFile, this.convertHtml(data)); + return; + } + + throw new Error(`Не удалось определить формат файла: ${url}`); + } } } @@ -55,7 +63,7 @@ class BookConverter { } if (selected == 'ISO-8859-5') { - selected = getEncoding(data); + selected = textUtils.getEncoding(data); } return iconv.decode(data, selected); diff --git a/server/core/BookConverter/textUtils.js b/server/core/BookConverter/textUtils.js new file mode 100644 index 00000000..2471c1ec --- /dev/null +++ b/server/core/BookConverter/textUtils.js @@ -0,0 +1,82 @@ +function getEncoding(buf) { + const lowerCase = 3; + const upperCase = 1; + + const codePage = { + 'k': 'koi8-r', + 'w': 'Windows-1251', + 'd': 'cp866', + 'i': 'ISO-8859-5', + 'm': 'maccyrillic', + }; + + let charsets = { + 'k': 0, + 'w': 0, + 'd': 0, + 'i': 0, + 'm': 0 + }; + + const len = buf.length; + const blockSize = (len > 5*3000 ? 3000 : len); + let counter = 0; + let i = 0; + while (i < len) { + const char = buf[i]; + i++; + //non-russian characters + if (char < 128 || char > 256) + continue; + //CP866 + if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase; + if ((char > 127 && char < 160)) charsets['d'] += upperCase; + + //KOI8-R + if ((char > 191 && char < 223)) charsets['k'] += lowerCase; + if ((char > 222 && char < 256)) charsets['k'] += upperCase; + + //WIN-1251 + if (char > 223 && char < 256) charsets['w'] += lowerCase; + if (char > 191 && char < 224) charsets['w'] += upperCase; + + //MAC + if (char > 221 && char < 255) charsets['m'] += lowerCase; + if (char > 127 && char < 160) charsets['m'] += upperCase; + + //ISO-8859-5 + if (char > 207 && char < 240) charsets['i'] += lowerCase; + if (char > 175 && char < 208) charsets['i'] += upperCase; + + counter++; + + if (counter > blockSize) { + counter = 0; + i += Math.round(len/2 - 2*blockSize); + } + } + + let sorted = Object.keys(charsets).map(function(key) { + return { codePage: codePage[key], c: charsets[key] }; + }); + + sorted.sort((a, b) => b.c - a.c); + + return sorted[0].codePage; +} + +function checkIfText(buf) { + let spaceCount = 0; + for (let i = 0; i < buf.length; i++) { + if (buf[i] == 32) + spaceCount++; + } + const freq = spaceCount/(buf.length + 1); + + return (freq > 0.1); +} + +module.exports = { + getEncoding, + checkIfText, +} \ No newline at end of file