Улучшено определение кодировки

This commit is contained in:
Book Pauk
2023-03-17 13:39:50 +07:00
parent bf6cf4238a
commit 4b4f7bd697
2 changed files with 3 additions and 12 deletions

View File

@@ -35,7 +35,7 @@ class Fb2Helper {
if (m) { if (m) {
let enc = m[1].toLowerCase(); let enc = m[1].toLowerCase();
if (enc != 'utf-8') { if (enc != 'utf-8') {
//enc может не соответствовать реальной кодировке файла, поэтому: //если кодировка не определена в getEncoding, используем enc
if (encoding.indexOf('ISO-8859') >= 0) { if (encoding.indexOf('ISO-8859') >= 0) {
encoding = enc; encoding = enc;
} }

View File

@@ -4,7 +4,7 @@ function getEncoding(buf) {
let selected = getEncodingLite(buf); let selected = getEncodingLite(buf);
if (selected == 'ISO-8859-5' && buf.length > 10) { if (selected == 'ISO-8859-5' && buf.length > 10) {
const charsetAll = chardet.analyse(buf.slice(0, 20000)); const charsetAll = chardet.analyse(buf.slice(0, 100000));
for (const charset of charsetAll) { for (const charset of charsetAll) {
if (charset.name.indexOf('ISO-8859') < 0) { if (charset.name.indexOf('ISO-8859') < 0) {
selected = charset.name; selected = charset.name;
@@ -39,9 +39,7 @@ function getEncodingLite(buf, returnAll) {
'u': 0, 'u': 0,
}; };
const len = buf.length; const len = (buf.length > 100000 ? 100000 : buf.length);
const blockSize = (len > 5*3000 ? 3000 : len);
let counter = 0;
let i = 0; let i = 0;
let totalChecked = 0; let totalChecked = 0;
while (i < len) { while (i < len) {
@@ -76,13 +74,6 @@ function getEncodingLite(buf, returnAll) {
if (char > 207 && char < 240) charsets['i'] += lowerCase; if (char > 207 && char < 240) charsets['i'] += lowerCase;
if (char > 175 && char < 208) charsets['i'] += upperCase; if (char > 175 && char < 208) charsets['i'] += upperCase;
} }
counter++;
if (counter > blockSize) {
counter = 0;
i += Math.round(len/2 - 2*blockSize);
}
} }
let sorted = Object.keys(charsets).map(function(key) { let sorted = Object.keys(charsets).map(function(key) {