Улучшено определение кодировки и текстового файла

This commit is contained in:
Book Pauk
2019-02-23 19:09:57 +07:00
parent 3e2f01d56d
commit aeadb5aeb8
2 changed files with 39 additions and 20 deletions

View File

@@ -66,7 +66,10 @@ class BookConverter {
} }
} }
return iconv.decode(data, selected); if (selected.toLowerCase() != 'utf-8')
return iconv.decode(data, selected);
else
return data;
} }
checkEncoding(data) { checkEncoding(data) {

View File

@@ -1,4 +1,4 @@
function getEncoding(buf) { function getEncoding(buf, returnAll) {
const lowerCase = 3; const lowerCase = 3;
const upperCase = 1; const upperCase = 1;
@@ -8,6 +8,7 @@ function getEncoding(buf) {
'd': 'cp866', 'd': 'cp866',
'i': 'ISO-8859-5', 'i': 'ISO-8859-5',
'm': 'maccyrillic', 'm': 'maccyrillic',
'u': 'utf-8',
}; };
let charsets = { let charsets = {
@@ -15,38 +16,47 @@ function getEncoding(buf) {
'w': 0, 'w': 0,
'd': 0, 'd': 0,
'i': 0, 'i': 0,
'm': 0 'm': 0,
'u': 0,
}; };
const len = buf.length; const len = buf.length;
const blockSize = (len > 5*3000 ? 3000 : len); const blockSize = (len > 5*3000 ? 3000 : len);
let counter = 0; let counter = 0;
let i = 0; let i = 0;
let totalChecked = 0;
while (i < len) { while (i < len) {
const char = buf[i]; const char = buf[i];
const nextChar = (i < len - 1 ? buf[i + 1] : 0);
totalChecked++;
i++; i++;
//non-russian characters //non-russian characters
if (char < 128 || char > 256) if (char < 128 || char > 256)
continue; continue;
//CP866 //UTF-8
if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase; if ((char == 208 || char == 209) && nextChar >= 128 && nextChar <= 190)
if ((char > 127 && char < 160)) charsets['d'] += upperCase; charsets['u'] += lowerCase;
else {
//CP866
if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
if ((char > 127 && char < 160)) charsets['d'] += upperCase;
//KOI8-R //KOI8-R
if ((char > 191 && char < 223)) charsets['k'] += lowerCase; if ((char > 191 && char < 223)) charsets['k'] += lowerCase;
if ((char > 222 && char < 256)) charsets['k'] += upperCase; if ((char > 222 && char < 256)) charsets['k'] += upperCase;
//WIN-1251 //WIN-1251
if (char > 223 && char < 256) charsets['w'] += lowerCase; if (char > 223 && char < 256) charsets['w'] += lowerCase;
if (char > 191 && char < 224) charsets['w'] += upperCase; if (char > 191 && char < 224) charsets['w'] += upperCase;
//MAC //MAC
if (char > 221 && char < 255) charsets['m'] += lowerCase; if (char > 221 && char < 255) charsets['m'] += lowerCase;
if (char > 127 && char < 160) charsets['m'] += upperCase; if (char > 127 && char < 160) charsets['m'] += upperCase;
//ISO-8859-5 //ISO-8859-5
if (char > 207 && char < 240) charsets['i'] += lowerCase; if (char > 207 && char < 240) charsets['i'] += lowerCase;
if (char > 175 && char < 208) charsets['i'] += upperCase; if (char > 175 && char < 208) charsets['i'] += upperCase;
}
counter++; counter++;
@@ -57,18 +67,24 @@ function getEncoding(buf) {
} }
let sorted = Object.keys(charsets).map(function(key) { let sorted = Object.keys(charsets).map(function(key) {
return { codePage: codePage[key], c: charsets[key] }; return { codePage: codePage[key], c: charsets[key], totalChecked };
}); });
sorted.sort((a, b) => b.c - a.c); sorted.sort((a, b) => b.c - a.c);
if (sorted[0].c > 0) if (returnAll)
return sorted;
else if (sorted[0].c > 0)
return sorted[0].codePage; return sorted[0].codePage;
else else
return 'ISO-8859-5'; return 'ISO-8859-5';
} }
function checkIfText(buf) { function checkIfText(buf) {
const enc = getEncoding(buf, true);
if (enc[0].c > enc[0].totalChecked*0.9)
return true;
let spaceCount = 0; let spaceCount = 0;
let crCount = 0; let crCount = 0;
let lfCount = 0; let lfCount = 0;