Files
liberama/docs/omnireader.ru/old/f.php

487 lines
14 KiB
PHP

<?php
// Use Moscow time as the default timezone for all date/time functions in this script.
date_default_timezone_set('Europe/Moscow');
// NOTE(review): presumably defines $use_gzip and $tmp_dir, which this script reads
// but never assigns - confirm against config/config.php.
require_once 'config/config.php';
// NOTE(review): presumably provides parseHtml(), which this script calls but does
// not define - confirm against parser.php.
require_once 'parser.php';
// Score weights used by getEncoding(): a byte that falls in a charset's lowercase
// Cyrillic range adds LOWERCASE to that charset's score, a byte in its uppercase
// range adds UPPERCASE.
define('LOWERCASE',3);
define('UPPERCASE',1);
/**
 * Looks up a request parameter in $_REQUEST.
 *
 * @param string $param        Name of the parameter.
 * @param mixed  $defaultValue Returned when the parameter is not set (or is null).
 * @return mixed The raw parameter value, or $defaultValue.
 */
function getParam($param, $defaultValue = '') {
    // Guard clause: absent parameters fall back to the caller-supplied default.
    if (!isset($_REQUEST[$param])) {
        return $defaultValue;
    }
    return $_REQUEST[$param];
}
/**
 * Guesses the character encoding of a (presumably Russian) text.
 *
 * Every byte >= 128 is scored against the lowercase/uppercase Cyrillic ranges
 * of five single-byte charsets; lowercase hits weigh LOWERCASE, uppercase hits
 * weigh UPPERCASE. The best-scoring charset wins unless the input is valid
 * UTF-8, in which case 'u' is returned.
 *
 * @param string $str       Raw bytes to inspect.
 * @param bool   $check_utf Internal flag: TRUE on the recursive call that scores
 *                          the text after converting it from UTF-8 to cp1251.
 * @return string One of: 'u' (UTF-8), 'k' (KOI8-R), 'w' (Windows-1251),
 *                'd' (CP866), 'i' (ISO-8859-5), 'm' (Mac Cyrillic).
 */
function getEncoding($str, $check_utf = FALSE) {
    if (!$check_utf) {
        // If the text, treated as UTF-8 and converted to cp1251, scores as
        // Windows-1251, the original was UTF-8.
        $result = getEncoding(mb_convert_encoding($str, 'cp1251', 'UTF-8'), TRUE);
        if ($result == 'w')
            return 'u';
    }

    $charsets = array(
        'k' => 0,
        'w' => 0,
        'd' => 0,
        'i' => 0,
        'm' => 0
    );

    $length = strlen($str);
    // Inputs longer than 15000 bytes are sampled: after more than $block_size
    // scored bytes the scan jumps ahead instead of reading everything.
    $block_size = ($length > 5*3000) ? 3000 : $length;
    $counter = 0;

    for ($i = 0; $i < $length; $i++) {
        $char = ord($str[$i]);

        // ASCII bytes carry no charset information (ord() never exceeds 255).
        if ($char < 128)
            continue;

        //CP866
        if (($char > 159 && $char < 176) || ($char > 223 && $char < 242)) $charsets['d'] += LOWERCASE;
        if ($char > 127 && $char < 160) $charsets['d'] += UPPERCASE;

        //KOI8-R: lowercase is 0xC0-0xDF (192-223), uppercase is 0xE0-0xFF (224-255).
        // The boundary used to be one too low, scoring 0xDF as uppercase.
        if ($char > 191 && $char < 224) $charsets['k'] += LOWERCASE;
        if ($char > 223 && $char < 256) $charsets['k'] += UPPERCASE;

        //WIN-1251
        if ($char > 223 && $char < 256) $charsets['w'] += LOWERCASE;
        if ($char > 191 && $char < 224) $charsets['w'] += UPPERCASE;

        //MAC
        if ($char > 221 && $char < 255) $charsets['m'] += LOWERCASE;
        if ($char > 127 && $char < 160) $charsets['m'] += UPPERCASE;

        //ISO-8859-5
        if ($char > 207 && $char < 240) $charsets['i'] += LOWERCASE;
        if ($char > 175 && $char < 208) $charsets['i'] += UPPERCASE;

        $counter++;
        if ($counter > $block_size) {
            $counter = 0;
            $i += (int)($length/2 - 2*$block_size);
        }
    }

    // Highest score first; arsort() leaves the internal pointer on the winner.
    arsort($charsets);

    // An empty pattern with the /u modifier only matches when $str is valid UTF-8.
    if (preg_match('//u', $str))
        return 'u';

    return key($charsets);
}
/**
 * Returns the trimmed text between the first <tagName> and the closing
 * </tagName> that follows it.
 *
 * Only the exact attribute-less opening tag is recognised.
 *
 * @param string $tagName Tag name without angle brackets.
 * @param string $book    Document to search.
 * @return string The enclosed text, or '' when either tag is missing.
 */
function getTag($tagName, $book) {
    $from_tag = '<' . $tagName . '>';
    $to_tag = '</' . $tagName . '>';

    $from = strpos($book, $from_tag);
    if ($from === FALSE)
        return '';
    $from += strlen($from_tag);

    // Search for the closing tag only after the opening one, so a stray
    // closing tag earlier in the document cannot produce a negative length.
    $to = strpos($book, $to_tag, $from);
    if ($to === FALSE)
        return '';

    return trim(substr($book, $from, $to - $from));
}
/**
 * Extracts author/title from a downloaded page and, for pages carrying the
 * samlib body marker, cuts the page down to the book text.
 *
 * @param string $book      Page contents.
 * @param array  $meta_info Passed by reference; 'author' and 'title' are set here.
 * @return string $book unchanged, or for marker pages
 *                '<dd>author<dd>title<empty-line/>' followed by the text between
 *                the start marker and the end marker (or the end of the page).
 */
function getMetaInfoAndFilter($book, &$meta_info) {
    $meta_info['author'] = '';
    $meta_info['title'] = getTag('title', $book);
    $out = $book;

    //fb2 ??? ---------------------
    // A title containing <p> is split into paragraphs: the first is taken as
    // the author and the second as the title.
    if (strpos($meta_info['title'], '<p>') !== FALSE) {
        $s = str_replace('</p>', '', $meta_info['title']);
        $a = explode('<p>', $s);
        // $a[1] always exists because the title contains '<p>'. $a[2] exists
        // only when there is a second paragraph; pass NULL explicitly in that
        // case instead of reading an undefined offset.
        $meta_info['author'] = parseHtml($a[1], TRUE);
        $meta_info['title'] = parseHtml(isset($a[2]) ? $a[2] : NULL, TRUE);
        if ($meta_info['title'] === NULL || $meta_info['title'] === '') {
            // No usable second paragraph: the whole <title> text is the title.
            $s = parseHtml($s, TRUE);
            $meta_info['author'] = '';
            $meta_info['title'] = $s;
        }
    }

    //samlib ----------------------
    // NOTE(review): the two marker literals look like Windows-1251 text viewed
    // in another charset; their bytes must stay exactly as they are - confirm.
    $samlib_start_sign = '<!----------- Ñîáñòâåííî ïðîèçâåäåíèå --------------->';
    $samlib_book_idx = strpos($book, $samlib_start_sign);
    if ($samlib_book_idx !== FALSE) {
        // The author is the part of the first <h3> before ': <small>'
        // (empty when that separator is absent).
        $samlib_author = getTag('h3', $book);
        $author_end = strpos($samlib_author, ': <small>');
        $meta_info['author'] = ($author_end === FALSE ? '' : substr($samlib_author, 0, $author_end));
        $meta_info['title'] = getTag('h2', $book);

        $samlib_book_idx += strlen($samlib_start_sign);
        $samlib_book_end_idx = strpos($book, '<!---- Áëîê îïèñàíèÿ ïðîèçâåäåíèÿ (ñëåâà âíèçó) ----------------------->');
        $samlib_book_end_idx = ($samlib_book_end_idx === FALSE ? strlen($book) : $samlib_book_end_idx);

        $out = '<dd>' . $meta_info['author'] . '<dd>' . $meta_info['title'] . '<empty-line/>' .
            substr($book, $samlib_book_idx, $samlib_book_end_idx - $samlib_book_idx);
        // Lines consisting only of a <dd> and non-breaking spaces become empty lines.
        $out = preg_replace("/<dd>&nbsp;&nbsp[;]*\s*[\r\n]/", '<empty-line/>', $out);
    }

    return $out;
}
/**
 * Marks paragraphs in plain text, prepends the response header line and
 * optionally gzips the result.
 *
 * When the text contains no '<P>' marker, the most common non-zero line
 * indentation (counted in leading spaces) is taken as the paragraph indent and
 * '<p>' is inserted before the first non-space character of every line
 * indented at least that much. Text that already contains '<P>' is passed
 * through unchanged.
 *
 * Side effect: sends a 'Content-Encoding: gzip' header when the output is
 * compressed.
 *
 * @param array  $meta_info Must contain 'author' and 'title'.
 * @param string $txtin     Parsed book text.
 * @return string 'no_file|author|title<<<bpr...bpr>>>' followed by the text,
 *                gzip-encoded when $use_gzip is on, the client accepts gzip and
 *                neither the 'meta' nor the 'curl' request parameter is set.
 */
function filterTextAndGzip($meta_info, $txtin) {
    global $use_gzip;

    if (strpos($txtin, '<P>') === FALSE) {
        $len = strlen($txtin);

        // Pass 1: histogram of leading-space counts. $counts[0] is additionally
        // bumped for every indented line, so it effectively counts all lines.
        $counts = array();
        $flag = 0; // 1 while still inside the leading whitespace of a line
        $c = 0;    // leading spaces seen on the current line
        for ($i = 0; $i < $len; $i++) {
            if ($txtin[$i] == chr(10) || $i == 0) {
                // Initialise explicitly: incrementing an undefined key raises
                // a diagnostic that the script's error handler may escalate.
                if (!isset($counts[$c]))
                    $counts[$c] = 0;
                $counts[$c]++;
                if ($c > 0) {
                    if (!isset($counts[0]))
                        $counts[0] = 0;
                    $counts[0]++;
                }
                $c = 0;
                $flag = 1;
            } else if ($txtin[$i] != ' ') {
                $flag = 0;
            } else if ($flag) {
                $c++;
            }
        }

        // The second most frequent indentation is the paragraph indent
        // (0 when only one indentation width occurs).
        arsort($counts);
        $key = 0;
        if (count($counts) > 1) {
            next($counts);
            $key = key($counts);
        }

        // Pass 2: copy the text, inserting <p> before the first non-space
        // character of each sufficiently indented line.
        $txtout = '';
        $flag = 0;
        $c = 0;
        for ($i = 0; $i < $len; $i++) {
            if ($txtin[$i] == chr(10) || $i == 0) {
                $c = 0;
                $flag = 1;
            } else if ($txtin[$i] != ' ') {
                if ($c >= $key && $flag)
                    $txtout .= '<p>';
                $flag = 0;
            } else if ($flag) {
                $c++;
            }
            $txtout .= $txtin[$i];
        }
    } else {
        $txtout = $txtin;
    }

    $txtout = 'no_file' . '|' . $meta_info['author'] . '|' . $meta_info['title'] .
        '<<<bpr5A432688AB0467AA396E5A144830248Abpr>>>' . $txtout;

    // The Accept-Encoding header is optional (and absent on the CLI).
    $supportsGzip = isset($_SERVER['HTTP_ACCEPT_ENCODING'])
        && strpos($_SERVER['HTTP_ACCEPT_ENCODING'], 'gzip') !== false;
    if ($use_gzip && $supportsGzip && getParam('meta') == '' && getParam('curl') == '') {
        $txtout = gzencode($txtout, 9);
        header('Content-Encoding: gzip');
    }

    return $txtout;
}
/**
 * Error handler that turns PHP errors into exceptions.
 *
 * Errors masked out by error_reporting() and E_NOTICE (code 8) are ignored;
 * everything else is rethrown as an Exception carrying only the message.
 *
 * @param int    $errno   Error level.
 * @param string $errstr  Error message.
 * @param string $errfile File in which the error was raised (unused).
 * @param int    $errline Line at which the error was raised (unused).
 * @return void Returns nothing for ignored errors; PHP's internal handler
 *              is not invoked for them.
 * @throws Exception For every error that is not ignored.
 */
function myErrorHandler($errno, $errstr, $errfile, $errline)
{
    // This error code is not enabled in error_reporting.
    $isReported = (error_reporting() & $errno) != 0;

    // Notices are tolerated on purpose.
    if (!$isReported || $errno == E_NOTICE) {
        return;
    }

    throw new Exception("$errstr");
}
/**
 * Returns the uncompressed contents of the largest entry in a zip archive.
 *
 * When several entries share the largest size, the first one wins.
 *
 * @param string $filein Path to the zip file.
 * @return string Contents of the largest entry, or '' for an empty archive.
 * @throws Exception "zip->open failed" when the archive cannot be opened,
 *                   "zip->getStream failed" when the entry cannot be read.
 */
function unzip($filein) {
    $zip = new ZipArchive;
    if ($zip->open($filein) !== TRUE)
        throw new Exception("zip->open failed");

    // Pick the largest entry first, so only one entry is decompressed.
    $filename = '';
    $max_size = -1;
    $found = FALSE;
    for ($i = 0; $i < $zip->numFiles; $i++) {
        $stat = $zip->statIndex($i);
        if ($stat === FALSE)
            continue;
        if ($stat['size'] > $max_size) {
            $max_size = $stat['size'];
            $filename = $zip->getNameIndex($i);
            $found = TRUE;
        }
    }

    $result = '';
    if ($found) {
        $fp = $zip->getStream($filename);
        if (!$fp) {
            // Release the archive before reporting the failure.
            $zip->close();
            throw new Exception("zip->getStream failed");
        }
        $result = stream_get_contents($fp);
        fclose($fp);
    }

    $zip->close();
    return $result;
}
/**
 * Generates a 32-character uppercase hexadecimal identifier.
 *
 * The value is the RIPEMD-128 hash of a uniqid()-based token combined with an
 * MD5 of the namespace and whatever request details are available.
 *
 * @param string $namespace Optional text mixed into the hash.
 * @return string 32 uppercase hex characters.
 */
function create_guid($namespace = '') {
    $uid = md5(uniqid("", true));

    $data = $namespace;
    // Several of these keys are server-specific or missing on the CLI, so
    // only the ones that are present are mixed in.
    $serverKeys = array(
        'REQUEST_TIME',
        'HTTP_USER_AGENT',
        'LOCAL_ADDR',
        'LOCAL_PORT',
        'REMOTE_ADDR',
        'REMOTE_PORT'
    );
    foreach ($serverKeys as $serverKey) {
        if (isset($_SERVER[$serverKey]))
            $data .= $_SERVER[$serverKey];
    }

    // The hash input is the same as before; the removed $guid was never
    // defined and contributed an empty string.
    $hash = strtoupper(hash('ripemd128', $uid . md5($data)));
    return $hash;
}
/**
 * Current Unix timestamp in seconds, with the sub-second part as a fraction.
 *
 * @return float
 */
function microtime_float()
{
    // microtime(true) returns seconds + microseconds as one float.
    return microtime(true);
}
/**
 * Performs an HTTP request with cURL, following up to 10 redirects manually.
 *
 * @param array|string $curlOptions Options for curl_setopt_array(); '' for none.
 * @param array|string $curlHeaders Request header lines; '' for none.
 * @param array|string $postFields  When non-empty the request is sent as a POST
 *                                  with these fields (re-sent on each redirect).
 * @return string Response body of the final 200 response.
 * @throws Exception "ERROR <code>[<br>curl error]" for a failed or non-200
 *                   response; a dump of the transfer info when the 'curl'
 *                   request parameter is set (debug mode); an error when the
 *                   redirect limit is exhausted.
 */
function curlExec(/* Array */$curlOptions='', /* Array */$curlHeaders='', /* Array */$postFields='')
{
    $newUrl = '';

    // At most 10 requests are made; each redirect consumes one.
    for ($attemptsLeft = 10; $attemptsLeft >= 1; $attemptsLeft--) {
        $ch = curl_init();
        // NOTE(review): TLS certificate and host verification are disabled
        // here, so HTTPS downloads are not authenticated. Kept as is because
        // enabling it would change which sites can be fetched.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        if (!empty($curlOptions)) curl_setopt_array($ch, $curlOptions);
        if (!empty($curlHeaders)) curl_setopt($ch, CURLOPT_HTTPHEADER, $curlHeaders);
        if (!empty($postFields)) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $postFields);
        }
        if (!empty($newUrl)) curl_setopt($ch, CURLOPT_URL, $newUrl); // redirect needed
        // Headers are returned in front of the body and split off below.
        curl_setopt($ch, CURLOPT_HEADER, 1);

        $response = curl_exec($ch);
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $info = curl_getinfo($ch);
        $curlError = curl_error($ch);
        // Everything needed has been read; close now so that no exit path
        // below (throw or redirect) leaks the handle.
        curl_close($ch);

        if ($response === FALSE) {
            $header = '';
            $curlResult = FALSE;
        } else {
            $header = substr($response, 0, $header_size);
            $curlResult = substr($response, $header_size);
        }

        // Debug mode: report the transfer details instead of the content.
        if (getParam('curl') != '') {
            throw new Exception("<br>" . str_replace("[", "<br>[", print_r($info, TRUE)) . "<br>$header<br>END");
        }

        if ($code == 301 || $code == 302 || $code == 303 || $code == 307 || $code == 308) {
            if (array_key_exists('redirect_url', $info) && !empty($info['redirect_url'])) {
                $newUrl = trim($info['redirect_url']);
            } else if (preg_match('/^Location:(.*?)\r?\n/mi', $header, $matches)) {
                // Header names are case-insensitive; anchor to the line start
                // so that e.g. Content-Location is not picked up.
                $newUrl = trim($matches[1]);
            } else {
                $newUrl = '';
            }
            continue;
        }

        // no more redirection
        if ($curlResult === FALSE || $info['http_code'] != 200) {
            $message = "ERROR ". $info['http_code'];
            if ($curlError)
                $message .= "<br>". $curlError;
            throw new Exception($message);
        }

        return $curlResult;
    }

    // Thrown (rather than die()) so callers' catch blocks handle it like any
    // other download failure.
    throw new Exception('Error: reached the limit of redirections');
}
/**
 * Tries to fetch a book through the local reader API on 127.0.0.1:44081.
 *
 * Submits the URL to 'reader/load-book', polls 'worker/get-state' every 0.5 s
 * until the state is 'finish', then downloads the file at the reported path
 * and gunzips it.
 *
 * @param array  $curlOptions Base cURL options; CURLOPT_URL is overwritten here.
 * @param string $url         Book URL to load.
 * @return string|false The decoded content, or false on any failure
 *                      (API unreachable, bad response, polling limit, exception).
 */
function tryNewApi($curlOptions, $url) {
    try {
        $api = 'http://127.0.0.1:44081/api/';
        $host = 'http://127.0.0.1';
        $jsonHeaders = array('Content-type: application/json');

        // Build the body with json_encode() so quotes, backslashes and control
        // characters in the URL cannot produce invalid JSON.
        $request = json_encode(array('url' => $url));
        if ($request === FALSE)
            return false; // e.g. URL is not valid UTF-8

        $curlOptions[CURLOPT_URL] = $api . 'reader/load-book';
        $out = curlExec($curlOptions, $jsonHeaders, $request);
        $out = json_decode($out, true);
        if (!$out)
            return false;

        $workerId = $out['workerId'];
        // Always sent as a JSON string.
        $stateRequest = json_encode(array('workerId' => (string)$workerId));
        if ($stateRequest === FALSE)
            return false;

        $i = 0;
        while ($out['state'] != 'finish') {
            usleep(500*1000);
            $curlOptions[CURLOPT_URL] = $api . 'worker/get-state';
            $out = curlExec($curlOptions, $jsonHeaders, $stateRequest);
            $out = json_decode($out, true);
            // Give up after a bad response or more than 250 polls.
            if (!$out || $i > 250)
                return false;
            $i++;
        }

        $path = $out['path'];
        $curlOptions[CURLOPT_URL] = $host . $path;
        $out = curlExec($curlOptions);
        $out = gzdecode($out);
        return $out;
    } catch (Exception $e) {
        // Any failure means "API not usable"; the caller falls back to a
        // direct download.
        return false;
    }
}
// Entry point: downloads the book named by the 'url' request parameter,
// converts it to Windows-1251, extracts author/title, parses the HTML and
// prints the result as text/plain. Any failure is printed as a text/html
// error message instead.
// Request parameters: url (required), encoding (optional one-letter charset
// override, see getEncoding), meta (when set, report timings instead of the
// text), curl (read by curlExec and filterTextAndGzip).
{
    set_error_handler("myErrorHandler");
    // set_time_limit(300);
    $url = getParam('url');
    // Array-style parameters (url[]=...) are not valid addresses.
    if (!is_string($url))
        $url = '';
    try {
        $body = '';
        if ($url == '')
            throw new Exception("íå çàäàí àäðåñ êíèãè");
        $meta_info = array();
        $time_start = $time = microtime_float();
        $pid = create_guid();
        $dir = 'txt/';
        $encoding = getParam('encoding');

        // Normalise the URL: default to http://, strip quotes, encode brackets.
        if (strpos($url, 'http://') !== 0 && strpos($url, 'https://') !== 0)
            $url = 'http://' . $url;
        $url = str_replace('"', '', $url);
        $url = str_replace('\'', '', $url);
        $url = str_replace(']', '%5D', str_replace('[', '%5B', $url));

        $options = array(
            CURLOPT_RETURNTRANSFER => TRUE,
            CURLOPT_TIMEOUT => 300,
            CURLOPT_BUFFERSIZE => 1024*128,
            CURLOPT_NOPROGRESS => FALSE,
            CURLOPT_USERAGENT => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6",
            CURLOPT_PROGRESSFUNCTION => function() {
                // PHP >= 5.5 passes the cURL handle as the first argument and
                // older versions do not; drop it so that the arguments are
                // (expected total, downloaded so far, ...).
                $args = func_get_args();
                if (count($args) >= 5)
                    array_shift($args);
                $expected = isset($args[0]) ? $args[0] : 0;
                $downloaded = isset($args[1]) ? $args[1] : 0;
                // Returning non-0 aborts the transfer once more than 50 MiB
                // is announced or has actually been received.
                return (max($expected, $downloaded) > (50 * 1024 * 1024)) ? 1 : 0;
            }
        );

        // Prefer the local reader API; fall back to a direct download.
        $out = tryNewApi($options, $url);
        if (!$out) {
            $options[CURLOPT_URL] = $url;
            $out = curlExec($options);
        }
        $meta_info['time_curl'] = microtime_float() - $time;
        $time = microtime_float();

        //zip
        // "PK\x03\x04" is the zip signature; strncmp() is safe on content
        // shorter than four bytes.
        if (strncmp($out, "PK\x03\x04", 4) === 0) {
            $zipped_file = $tmp_dir . "/{$pid}-temp.zip";
            file_put_contents($zipped_file, $out);
            $out = unzip($zipped_file);
            if (file_exists($zipped_file)) unlink($zipped_file);
        }

        //pdf
        /* if ($out[0] == chr(0x25) && $out[1] == chr(0x50) && $out[2] == chr(0x44) && $out[3] == chr(0x46)) {
        $a = new PDF2Text();
        $a->reset();
        $a->decodePDF($out);
        $out = $a->output();
        file_put_contents('/tmp/1', $out);
        }*/
        $meta_info['time_unzip'] = microtime_float() - $time;
        $time = microtime_float();

        //decoding and parsing
        if ($out !== FALSE) {
            if ($encoding == '')
                $encoding = getEncoding($out);
            // Everything is normalised to Windows-1251 ('w' needs no conversion).
            switch ($encoding) {
                case 'k':
                    $out = mb_convert_encoding($out, 'cp1251', 'KOI8-R');
                    break;
                case 'w':
                    break;
                case 'd':
                    $out = mb_convert_encoding($out, 'cp1251', 'cp866');
                    break;
                case 'i':
                    $out = mb_convert_encoding($out, 'cp1251', 'ISO-8859-5');
                    break;
                case 'm':
                    $out = mb_convert_encoding($out, 'cp1251', 'MACINTOSH');
                    break;
                case 'u':
                    $out = mb_convert_encoding($out, 'cp1251', 'UTF-8');
                    break;
            }
            //$out = $encoding . '===' . $out;
            //file_put_contents('/tmp/bpr1', $out);
            $meta_info['time_decodepage'] = microtime_float() - $time;
            $time = microtime_float();

            $out = getMetaInfoAndFilter($out, $meta_info);
            $meta_info['time_metainfo'] = microtime_float() - $time;
            $time = microtime_float();

            $out = parseHtml($out);
            $meta_info['time_parsehtml'] = microtime_float() - $time;
            $time = microtime_float();

            $out = filterTextAndGzip($meta_info, $out);
            $meta_info['time_filter_gzip'] = microtime_float() - $time;
            $meta_info['time_total'] = microtime_float() - $time_start;

            // Diagnostic mode: show the collected meta info via the error path.
            $meta = getParam('meta');
            if ($meta != '') {
                $info = '';
                foreach ($meta_info as $key => $value) {
                    if (strpos($key, 'time') !== FALSE)
                        $info .= sprintf("%06.3f", $value) . " $key <br>";
                    else
                        $info .= "$key: $value<br>";
                }
                throw new Exception("<br>" . $info);
            }

            header('Content-Type: text/plain; charset=windows-1251');
            echo $out;
            //file_put_contents('/tmp/bpr2', $out);
            return;
        } else
            throw new Exception("îøèáêà çàãðóçêè ôàéëà. Ïîïðîáóéòå åùå ðàç.");
    } catch (Exception $e) {
        header('Content-Type: text/html; charset=windows-1251');
        $err = $e->getMessage();
        if (strpos($err, 'ERROR 404') !== FALSE)
            $err = 'ñòðàíèöà íå íàéäåíà';
        // The URL is user input shown in an HTML page, so it is escaped.
        // $err is left as is because it deliberately contains <br> markup.
        // NOTE(review): $err may itself embed request data (debug dumps).
        $body = "Îøèáêà çàãðóçêè êíèãè: " .
            ($url == '' ? '' : "(" . htmlspecialchars($url, ENT_QUOTES, 'cp1251') . ") ") . $err;
    }
    echo $body;
}
?>