From aeadb5aeb89b6bd4c5030ba7cf643b61842f2833 Mon Sep 17 00:00:00 2001 From: Book Pauk Date: Sat, 23 Feb 2019 19:09:57 +0700 Subject: [PATCH] =?UTF-8?q?=D0=A3=D0=BB=D1=83=D1=87=D1=88=D0=B5=D0=BD?= =?UTF-8?q?=D0=BE=20=D0=BE=D0=BF=D1=80=D0=B5=D0=B4=D0=B5=D0=BB=D0=B5=D0=BD?= =?UTF-8?q?=D0=B8=D0=B5=20=D0=BA=D0=BE=D0=B4=D0=B8=D1=80=D0=BE=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=20=D0=B8=20=D1=82=D0=B5=D0=BA=D1=81=D1=82=D0=BE=D0=B2?= =?UTF-8?q?=D0=BE=D0=B3=D0=BE=20=D1=84=D0=B0=D0=B9=D0=BB=D0=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/core/BookConverter/index.js | 5 ++- server/core/BookConverter/textUtils.js | 54 +++++++++++++++++--------- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/server/core/BookConverter/index.js b/server/core/BookConverter/index.js index 93865f30..afa311e6 100644 --- a/server/core/BookConverter/index.js +++ b/server/core/BookConverter/index.js @@ -66,7 +66,10 @@ class BookConverter { } } - return iconv.decode(data, selected); + if (selected.toLowerCase() != 'utf-8') + return iconv.decode(data, selected); + else + return data; } checkEncoding(data) { diff --git a/server/core/BookConverter/textUtils.js b/server/core/BookConverter/textUtils.js index 475ef572..6ee5169d 100644 --- a/server/core/BookConverter/textUtils.js +++ b/server/core/BookConverter/textUtils.js @@ -1,4 +1,4 @@ -function getEncoding(buf) { +function getEncoding(buf, returnAll) { const lowerCase = 3; const upperCase = 1; @@ -8,6 +8,7 @@ function getEncoding(buf) { 'd': 'cp866', 'i': 'ISO-8859-5', 'm': 'maccyrillic', + 'u': 'utf-8', }; let charsets = { @@ -15,38 +16,47 @@ function getEncoding(buf) { 'w': 0, 'd': 0, 'i': 0, - 'm': 0 + 'm': 0, + 'u': 0, }; const len = buf.length; const blockSize = (len > 5*3000 ? 3000 : len); let counter = 0; let i = 0; + let totalChecked = 0; while (i < len) { const char = buf[i]; + const nextChar = (i < len - 1 ? buf[i + 1] : 0); + totalChecked++; i++; //non-russian characters if (char < 128 || char > 256) continue; - //CP866 - if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase; - if ((char > 127 && char < 160)) charsets['d'] += upperCase; + //UTF-8 + if ((char == 208 || char == 209) && nextChar >= 128 && nextChar <= 190) + charsets['u'] += lowerCase; + else { + //CP866 + if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase; + if ((char > 127 && char < 160)) charsets['d'] += upperCase; - //KOI8-R - if ((char > 191 && char < 223)) charsets['k'] += lowerCase; - if ((char > 222 && char < 256)) charsets['k'] += upperCase; + //KOI8-R + if ((char > 191 && char < 223)) charsets['k'] += lowerCase; + if ((char > 222 && char < 256)) charsets['k'] += upperCase; - //WIN-1251 - if (char > 223 && char < 256) charsets['w'] += lowerCase; - if (char > 191 && char < 224) charsets['w'] += upperCase; + //WIN-1251 + if (char > 223 && char < 256) charsets['w'] += lowerCase; + if (char > 191 && char < 224) charsets['w'] += upperCase; - //MAC - if (char > 221 && char < 255) charsets['m'] += lowerCase; - if (char > 127 && char < 160) charsets['m'] += upperCase; + //MAC + if (char > 221 && char < 255) charsets['m'] += lowerCase; + if (char > 127 && char < 160) charsets['m'] += upperCase; - //ISO-8859-5 - if (char > 207 && char < 240) charsets['i'] += lowerCase; - if (char > 175 && char < 208) charsets['i'] += upperCase; + //ISO-8859-5 + if (char > 207 && char < 240) charsets['i'] += lowerCase; + if (char > 175 && char < 208) charsets['i'] += upperCase; + } counter++; @@ -57,18 +67,24 @@ function getEncoding(buf) { } let sorted = Object.keys(charsets).map(function(key) { - return { codePage: codePage[key], c: charsets[key] }; + return { codePage: codePage[key], c: charsets[key], totalChecked }; }); sorted.sort((a, b) => b.c - a.c); - if (sorted[0].c > 0) + if (returnAll) + return sorted; + else if (sorted[0].c > 0) return sorted[0].codePage; else return 'ISO-8859-5'; } function checkIfText(buf) { + const enc = getEncoding(buf, true); + if (enc[0].c > enc[0].totalChecked*0.9) + return true; + let spaceCount = 0; let crCount = 0; let lfCount = 0;