Улучшил поддержку текстовых файлов

This commit is contained in:
Book Pauk
2019-02-12 22:52:12 +07:00
parent 7dab3bfb1e
commit 9c20df510d
3 changed files with 98 additions and 76 deletions

View File

@@ -1,68 +0,0 @@
/**
 * Guess which single-byte Cyrillic code page a byte buffer uses.
 *
 * Each byte in the 128..256 window is tested against the lowercase and
 * uppercase ranges of every candidate code page; a lowercase hit is worth 3
 * points, an uppercase hit 1 point. The candidate with the highest total
 * wins, and on equal totals the candidate listed first wins.
 *
 * Inputs longer than 15000 bytes are sampled: once more than 3000 scored
 * bytes have been seen, the scan position jumps ahead by
 * round(length/2 - 6000) and the count restarts.
 *
 * @param {Buffer} buf - raw bytes to inspect (any indexable sequence of byte values with a length).
 * @returns {string} 'koi8-r', 'Windows-1251', 'cp866', 'ISO-8859-5' or 'maccyrillic'.
 */
function getEncoding(buf) {
    const LOWER_WEIGHT = 3;
    const UPPER_WEIGHT = 1;

    // Order matters: on equal scores the earliest candidate is returned.
    const candidates = [
        {
            name: 'koi8-r',
            score: 0,
            isLower: (b) => b > 191 && b < 223,
            isUpper: (b) => b > 222 && b < 256,
        },
        {
            name: 'Windows-1251',
            score: 0,
            isLower: (b) => b > 223 && b < 256,
            isUpper: (b) => b > 191 && b < 224,
        },
        {
            name: 'cp866',
            score: 0,
            isLower: (b) => (b > 159 && b < 176) || (b > 223 && b < 242),
            isUpper: (b) => b > 127 && b < 160,
        },
        {
            name: 'ISO-8859-5',
            score: 0,
            isLower: (b) => b > 207 && b < 240,
            isUpper: (b) => b > 175 && b < 208,
        },
        {
            name: 'maccyrillic',
            score: 0,
            isLower: (b) => b > 221 && b < 255,
            isUpper: (b) => b > 127 && b < 160,
        },
    ];

    const total = buf.length;
    const windowSize = (total > 5*3000 ? 3000 : total);
    let scoredInWindow = 0;

    for (let pos = 0; pos < total;) {
        const byte = buf[pos];
        pos += 1;

        // Bytes outside the 128..256 window carry no information and are not counted.
        if (byte < 128 || byte > 256) {
            continue;
        }

        for (const candidate of candidates) {
            if (candidate.isLower(byte)) {
                candidate.score += LOWER_WEIGHT;
            }
            if (candidate.isUpper(byte)) {
                candidate.score += UPPER_WEIGHT;
            }
        }

        scoredInWindow += 1;
        if (scoredInWindow > windowSize) {
            // Window exhausted: restart the count and skip ahead in the buffer.
            scoredInWindow = 0;
            pos += Math.round(total/2 - 2*windowSize);
        }
    }

    // First candidate with the maximum score.
    let winner = candidates[0];
    for (const candidate of candidates) {
        if (candidate.score > winner.score) {
            winner = candidate;
        }
    }
    return winner.name;
}
// The encoding detector is this module's only export.
module.exports = getEncoding;

View File

@@ -4,7 +4,7 @@ const iconv = require('iconv-lite');
const chardet = require('chardet');
const _ = require('lodash');
const sax = require('./sax');
const getEncoding = require('./getEncoding');
const textUtils = require('./textUtils');
const FileDetector = require('../FileDetector');
@@ -18,9 +18,10 @@ class BookConverter {
async convertToFb2(inputFile, outputFile, url, callback) {
const fileType = await this.detector.detectFile(inputFile);
if (fileType && (fileType.ext == 'html' || fileType.ext == 'xml')) {
const data = await fs.readFile(inputFile);
const data = await fs.readFile(inputFile);
callback(100);
if (fileType && (fileType.ext == 'html' || fileType.ext == 'xml')) {
if (data.toString().indexOf('<FictionBook') >= 0) {
await fs.writeFile(outputFile, data);
return;
@@ -34,12 +35,19 @@ class BookConverter {
}
await fs.writeFile(outputFile, this.convertHtml(data));
callback(100);
return;
} else {
if (fileType)
throw new Error(`unknown file format: ${fileType.mime}`);
else
throw new Error(`unsupported file format: ${url}`);
throw new Error(`Этот формат файла не поддерживается: ${fileType.mime}`);
else {
//может это чистый текст?
if (textUtils.checkIfText(data)) {
await fs.writeFile(outputFile, this.convertHtml(data));
return;
}
throw new Error(`Не удалось определить формат файла: ${url}`);
}
}
}
@@ -55,7 +63,7 @@ class BookConverter {
}
if (selected == 'ISO-8859-5') {
selected = getEncoding(data);
selected = textUtils.getEncoding(data);
}
return iconv.decode(data, selected);

View File

@@ -0,0 +1,82 @@
/**
 * Guesses which single-byte Cyrillic code page a byte buffer is encoded in.
 *
 * Every byte in the 128..256 window is matched against the Russian letter
 * ranges of each candidate code page. A byte that would be a lowercase letter
 * adds 3 points to that candidate, an uppercase letter adds 1 point. The
 * candidate with the highest score is returned; when scores are equal the
 * candidate declared first (koi8-r, Windows-1251, cp866, ISO-8859-5,
 * maccyrillic) wins, so empty or pure-ASCII input yields 'koi8-r'.
 *
 * Inputs longer than 15000 bytes are sampled rather than scanned in full:
 * after more than 3000 scored bytes the position jumps ahead by
 * round(len/2 - 6000) and the counter restarts.
 *
 * @param {Buffer} buf - raw bytes to inspect (any indexable sequence of byte values with a length).
 * @returns {string} 'koi8-r', 'Windows-1251', 'cp866', 'ISO-8859-5' or 'maccyrillic'.
 */
function getEncoding(buf) {
    const lowerCase = 3;
    const upperCase = 1;

    const codePage = {
        'k': 'koi8-r',
        'w': 'Windows-1251',
        'd': 'cp866',
        'i': 'ISO-8859-5',
        'm': 'maccyrillic',
    };

    // Accumulated score per candidate; key order doubles as tie-break priority.
    const charsets = {
        'k': 0,
        'w': 0,
        'd': 0,
        'i': 0,
        'm': 0
    };

    const len = buf.length;
    const blockSize = (len > 5*3000 ? 3000 : len);
    let counter = 0;
    let i = 0;
    while (i < len) {
        const char = buf[i];
        i++;

        //non-russian characters (not scored and not counted towards the block)
        if (char < 128 || char > 256)
            continue;

        //CP866
        if ((char > 159 && char < 176) || (char > 223 && char < 242)) charsets['d'] += lowerCase;
        if ((char > 127 && char < 160)) charsets['d'] += upperCase;

        //KOI8-R: lowercase is 0xC0-0xDF (0xDF is 'ъ'), uppercase is 0xE0-0xFF
        if ((char > 191 && char < 224)) charsets['k'] += lowerCase;
        if ((char > 223 && char < 256)) charsets['k'] += upperCase;

        //WIN-1251
        if (char > 223 && char < 256) charsets['w'] += lowerCase;
        if (char > 191 && char < 224) charsets['w'] += upperCase;

        //MAC
        if (char > 221 && char < 255) charsets['m'] += lowerCase;
        if (char > 127 && char < 160) charsets['m'] += upperCase;

        //ISO-8859-5
        if (char > 207 && char < 240) charsets['i'] += lowerCase;
        if (char > 175 && char < 208) charsets['i'] += upperCase;

        counter++;
        if (counter > blockSize) {
            // Block exhausted: restart the count and skip ahead in the buffer.
            counter = 0;
            i += Math.round(len/2 - 2*blockSize);
        }
    }

    // Pick the first key with the maximum score; an explicit scan keeps the
    // tie-break independent of sort stability.
    let best = null;
    for (const key of Object.keys(charsets)) {
        if (best === null || charsets[key] > charsets[best])
            best = key;
    }
    return codePage[best];
}
/**
 * Heuristic plain-text check based on how often the space byte occurs.
 *
 * Counts elements equal to 32 (ASCII space) and divides by length + 1, so an
 * empty input gives a frequency of 0 instead of dividing by zero.
 *
 * @param {Buffer} buf - raw bytes to inspect (any indexable sequence with a length).
 * @returns {boolean} true when spaces make up more than 10% of the input.
 */
function checkIfText(buf) {
    const SPACE = 32;
    const MIN_SPACE_FREQ = 0.1;

    const total = buf.length;
    let spaces = 0;
    let pos = 0;
    while (pos < total) {
        spaces += (buf[pos] == SPACE ? 1 : 0);
        pos += 1;
    }

    return spaces/(total + 1) > MIN_SPACE_FREQ;
}
// Exported helpers: code page detection and the plain-text heuristic.
module.exports = {
getEncoding,
checkIfText,
}