.
*/
/**
* @defgroup full_text Full-text Module
*/
/**
* Full-text is a Maarch module which allows you to make full text indexing with the Lucene engine.
* We use PHP version of Lucene integrated into the ZEND framework.
* This Maarch module proposes a batch allowing the full text indexing.
* This batch is launched for each collection of Maarch and works on Linux or Windows OS.
* It course a resources table and brings out documents candidates for full text.
* A user exit code is stored in fulltext_result column of the document in "res_x" :
*
* - 1 : Full Text extraction successfull
* - -1 : No file found
* - -2 : File extension not allowed for lucene
* - 2 : no result for this extraction
*
* @file
* @author Mathieu Donzel
* @author Laurent Giovannoni
* @date $date$
* @version $Revision$
* @ingroup full_text
* @brief Extraction of information on PDF with lucene functions of Zend Framework
*/
//error mode and function
error_reporting(E_ERROR);
set_error_handler(errorHandler);
// global vars of the program
/**
* Path to the log file
*/
$_ENV['log'] = "";
/**
* User exit of the program, contains 1 if any problem appears
*/
$_ENV['ErrorLevel'] = 0;
/**
* Connection object to database 1
*/
$_ENV['db'] = "";
/**
* Connection object to database 2
*/
$_ENV['db2'] = "";
/**
* Creation of the log file
*/
function loginCreation()
{
if(!is_dir(dirname($_SERVER["PHP_SELF"]).DIRECTORY_SEPARATOR."log".DIRECTORY_SEPARATOR))
{
mkdir(dirname($_SERVER["PHP_SELF"]).DIRECTORY_SEPARATOR."log".DIRECTORY_SEPARATOR."",0777);
}
if(!is_dir(dirname($_SERVER["PHP_SELF"]).DIRECTORY_SEPARATOR."log".DIRECTORY_SEPARATOR."".date("Y")."_".date("m")."_".date("d")."".DIRECTORY_SEPARATOR.""))
{
mkdir(dirname($_SERVER["PHP_SELF"]).DIRECTORY_SEPARATOR."log".DIRECTORY_SEPARATOR."".date("Y")."_".date("m")."_".date("d")."".DIRECTORY_SEPARATOR."",0777);
}
$folderLogName = dirname($_SERVER["PHP_SELF"]).DIRECTORY_SEPARATOR."log".DIRECTORY_SEPARATOR."".date("Y")."_".date("m")."_".date("d")."".DIRECTORY_SEPARATOR."";
$_ENV['log'] = $folderLogName."full_text_".date("Y")."_".date("m")."_".date("d")." ".date("H")."-".date("i")."-".date("s").".log";
writeLog("Application start with : ".$_SERVER['SCRIPT_FILENAME']);
}
/**
* Write on the log file
* @param $eventInfo string text which is written in the log file
*/
function writeLog($EventInfo)
{
$logFileOpened = fopen($_ENV['log'], "a");
fwrite($logFileOpened, "[".date("d")."/".date("m")."/".date("Y")." ".date("H").":".date("i").":".date("s")."] ".$EventInfo."\r\n");
fclose($logFileOpened);
}
/**
* Managing of errors
* @param $errno integer number of the error
* @param $errstr string text of the error
* @param $errfile string file concerned with the error
* @param $errline integer line of the error
* @param $errcontext string context of the error
*/
function errorHandler($errno, $errstr, $errfile, $errline, $errcontext)
{
writeLog("[ERROR] from line ".$errline." : ". $errstr." [ERROR]");
$_ENV['ErrorLevel'] = 1;
}
/**
* Check if a folder is empty
* @param $dir string path of the directory to chek
* @return boolean true if the directory exists
*/
function isDirEmpty($dir)
{
$dir = opendir($dir);
$isEmpty = true;
while(($entry = readdir($dir)) !== false)
{
if($entry !== '.' && $entry !== '..')
{
$isEmpty = false;
break;
}
}
closedir($dir);
return $isEmpty;
}
/**
* Launch the lucene engine if it's a pdf file
* @param $pathToFile string path of the file to index
* @param $indexFileDirectory string directory of the lucene index
* @param $format string format of the document to index
* @param $id integer id of the document to index
* @return integer user exit code stored in fulltext_result column of the document in "res_x"
*/
Function indexFullText($pathToFile, $indexFileDirectory, $format, $Id)
{
$result = -1;
if (is_file($pathToFile))
{
switch (strtoupper($format))
{
case "PDF":
writeLog("it's a PDF file");
$result = indexFullTextPdf($pathToFile, $indexFileDirectory, $Id);
break;
default:
$result = -2;
}
}
return $result;
}
/**
* Retrieve the text of a pdftext and launch the lucene engine
* @param $pathToFile string path of the file to index
* @param $indexFileDirectory string directory of the lucene index
* @param $id integer id of the document to index
* @return integer user exit code is stored in fulltext_result column of the document in "res_x"
*/
Function indexFullTextPdf($pathToFile, $indexFileDirectory, $Id)
{
$result = -1;
if(is_file($pathToFile))
{
$tmpFile = getcwd().DIRECTORY_SEPARATOR."..".DIRECTORY_SEPARATOR."tmp".DIRECTORY_SEPARATOR.basename($pathToFile).".ftx";
$pathToFile = str_replace("\\\\", "\\", $pathToFile);
if($_ENV['osname'] == "WINDOWS")
{
$resultExtraction = exec("\""."\"".getcwd().DIRECTORY_SEPARATOR."..".DIRECTORY_SEPARATOR."pdftotext".DIRECTORY_SEPARATOR.$_ENV['pdftotext']."\" \"".$pathToFile."\" \"".$tmpFile."\""."\"");
}
elseif($_ENV['osname'] == "UNIX")
{
$resultExtraction = exec("pdftotext \"".$pathToFile."\" \"".$tmpFile."\"");
writeLog("pdftotext \"".$pathToFile."\" \"".$tmpFile."\"");
}
$fileContent = trim(readFileF($tmpFile));
if(is_file($tmpFile)) unlink($tmpFile);
if(strlen($fileContent) > 1)
{
// Storing text in lucene index
set_include_path(getcwd().DIRECTORY_SEPARATOR."..".DIRECTORY_SEPARATOR.PATH_SEPARATOR.get_include_path());
require_once("Zend/Search/Lucene.php");
if(!is_dir($indexFileDirectory))
{
writeLog($indexFileDirectory." not exists !");
$index = Zend_Search_Lucene::create($indexFileDirectory);
}
else
{
if(isDirEmpty($indexFileDirectory))
{
writeLog($indexFileDirectory." empty !");
$index = Zend_Search_Lucene::create($indexFileDirectory);
}
else
{
$index = Zend_Search_Lucene::open($indexFileDirectory);
}
}
$term = new Zend_Search_Lucene_Index_Term($Id, 'Id');
foreach($index->termDocs($term) as $id)
{
$index->delete($id);
}
$doc = new Zend_Search_Lucene_Document();
$doc->addField(Zend_Search_Lucene_Field::UnIndexed('Id', $Id));
$doc->addField(Zend_Search_Lucene_Field::UnStored('contents', $fileContent));
$index->addDocument($doc);
$index->commit();
$result = 1;
}
else
{
$result = 2;
}
}
return $result;
}
/**
* Read a txt file
* @param $file string path of the file to read
* @return string contents of the file
*/
Function readFileF($file)
{
$result = "";
if(is_file($file))
{
$fp = fopen($file, "r");
$result = fread($fp, filesize($file));
fclose($fp);
}
Return $result;
}
// Begin
date_default_timezone_set('Europe/Paris');
if($argc != 2 )
{
echo "You must specify the configuration file." . $argc;
exit;
}
$conf = $argv[1];
//Load Fulltext Conf File
$xmlconfig = simplexml_load_file($conf);
foreach($xmlconfig->CONFIG as $CONFIG)
{
$base_directory = $CONFIG->BASE_DIRECTORY;
$_ENV['tablename'] = $CONFIG->TABLE_NAME;
$fulltextColumnName = $CONFIG->FULLTEXT_COLUMN_NAME;
$_ENV['max_batch_size'] = $CONFIG->MAX_BATCH_SIZE;
}
//Load LetterBox Conf File
$letterbox_conf = simplexml_load_file(getcwd().DIRECTORY_SEPARATOR."..".DIRECTORY_SEPARATOR."..".DIRECTORY_SEPARATOR."xml".DIRECTORY_SEPARATOR."config.xml");
foreach($letterbox_conf->CONFIG as $LTB_CONFIG)
{
$indexFileDirectory = getcwd().DIRECTORY_SEPARATOR."..".DIRECTORY_SEPARATOR."..".DIRECTORY_SEPARATOR.$LTB_CONFIG->path_to_lucene_index;
$_SESSION['config']['databaseserver'] = $LTB_CONFIG->databaseserver;
$_SESSION['config']['databasename'] = $LTB_CONFIG->databasename;
$_SESSION['config']['databaseuser'] = $LTB_CONFIG->databaseuser;
$_SESSION['config']['databasepassword'] = $LTB_CONFIG->databasepassword;
$_SESSION['config']['force_client_utf8'] = $LTB_CONFIG->force_client_utf8;
}
if(DIRECTORY_SEPARATOR == "/")
{
$_ENV['osname'] = "UNIX";
$_ENV['pdftotext'] = "pdftotext";
}
else
{
$_ENV['osname'] = "WINDOWS";
$_ENV['pdftotext'] = "pdftotext.exe";
}
loginCreation();
writeLog("Launch of Lucene full text engine");
writeLog("Loading the xml config file");
writeLog("Full text engine launched for table : ".$_ENV['tablename']);
set_include_path (getcwd().DIRECTORY_SEPARATOR."..".DIRECTORY_SEPARATOR."..".DIRECTORY_SEPARATOR.PATH_SEPARATOR.get_include_path());
require("class_functions.php");
require("class_db.php");
$_ENV['db'] = new dbquery();
$_ENV['db']->connect();
$_ENV['db2'] = new dbquery();
$_ENV['db2']->connect();
writeLog("connection on the DB server OK !");
$docServers = "select docserver_id, path_template from docservers";
$_ENV['db']->query($docServers);
writeLog("docServers found : ");
while($queryResult=$_ENV['db']->fetch_array())
{
$pathToDocServer[$queryResult[0]] = $queryResult[1];
writeLog($queryResult[1]);
}
$queryIndexFullText = "select res_id, docserver_id, path, filename, format from ".$_ENV['tablename']." where ".$fulltextColumnName." = 0 or ".$fulltextColumnName." = '' or ".$fulltextColumnName." is null ";
writeLog("query to found document with no full text : ".$queryIndexFullText);
$_ENV['db']->query($queryIndexFullText);
$cpt_batch_size=0;
writeLog("max_batch_size : ".$_ENV['max_batch_size']);
while($queryResult=$_ENV['db']->fetch_array())
{
if($_ENV['max_batch_size'] >= $cpt_batch_size)
{
$pathToFile = $pathToDocServer[$queryResult[1]] . str_replace("#", DIRECTORY_SEPARATOR, $queryResult[2]) . DIRECTORY_SEPARATOR . $queryResult[3];
writeLog("processing of document : ".$pathToFile." | res_id : ". $queryResult[0]);
echo "processing of document : ".$pathToFile." \r\n res_id : ". $queryResult[0]."\n";
$result = indexFullText($pathToFile, $indexFileDirectory, $queryResult[4], $queryResult[0]);
writeLog("Result of processing : ".$result);
echo "Result of processing : ".$result."\r\n";
$updateDoc = "update ".$_ENV['tablename']." SET ".$fulltextColumnName." = '".$result."' where res_id = ".$queryResult[0];
$queryUpdate = $_ENV['db2']->query($updateDoc);;
}
else
{
writeLog("Max batch size ! Stop processing !");
echo "\r\nMax batch size ! Stop processing !";
break;
}
$cpt_batch_size++;
}
writeLog("Return execution code : ".$_ENV['ErrorLevel']);
writeLog("End of application !");
exit($_ENV['ErrorLevel']);
?>