it-swarm.dev

Come estrarre il testo da file Word. Doc, docx, .xlsx, .pptx php

Ci può essere uno scenario che dobbiamo ottenere il testo dai documenti di Word per l'uso futuro per cercare la stringa nel documento caricato dall'utente come per la ricerca in cv/curriculum e si verifica un problema comune su come ottenere il testo, Apri e leggi un utente ha caricato un documento Word, ci sono alcuni link utili ma non cura l'intero problema. Abbiamo bisogno di ottenere il testo al momento del caricamento e salvare il testo nel database e possiamo facilmente cercare all'interno del database.

41
M Khalid Junaid

Ecco una semplice classe che fa il lavoro giusto per .doc/.docx, PHP docx reader: converti file MS Word Docx in testo .

    class DocxConversion{
    private $filename;

    public function __construct($filePath) {
        $this->filename = $filePath;
    }

    private function read_doc() {
        $fileHandle = fopen($this->filename, "r");
        $line = @fread($fileHandle, filesize($this->filename));   
        $lines = explode(chr(0x0D),$line);
        $outtext = "";
        foreach($lines as $thisline)
          {
            $pos = strpos($thisline, chr(0x00));
            if (($pos !== FALSE)||(strlen($thisline)==0))
              {
              } else {
                $outtext .= $thisline." ";
              }
          }
         $outtext = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\[email protected]\/\_\(\)]/","",$outtext);
        return $outtext;
    }

    private function read_docx(){

        $striped_content = '';
        $content = '';

        $Zip = Zip_open($this->filename);

        if (!$Zip || is_numeric($Zip)) return false;

        while ($Zip_entry = Zip_read($Zip)) {

            if (Zip_entry_open($Zip, $Zip_entry) == FALSE) continue;

            if (Zip_entry_name($Zip_entry) != "Word/document.xml") continue;

            $content .= Zip_entry_read($Zip_entry, Zip_entry_filesize($Zip_entry));

            Zip_entry_close($Zip_entry);
        }// end while

        Zip_close($Zip);

        $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
        $content = str_replace('</w:r></w:p>', "\r\n", $content);
        $striped_content = strip_tags($content);

        return $striped_content;
    }

 /************************Excel sheet************************************/

function xlsx_to_text($input_file){
    $xml_filename = "xl/sharedStrings.xml"; //content file name
    $Zip_handle = new ZipArchive;
    $output_text = "";
    if(true === $Zip_handle->open($input_file)){
        if(($xml_index = $Zip_handle->locateName($xml_filename)) !== false){
            $xml_datas = $Zip_handle->getFromIndex($xml_index);
            $xml_handle = DOMDocument::loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            $output_text = strip_tags($xml_handle->saveXML());
        }else{
            $output_text .="";
        }
        $Zip_handle->close();
    }else{
    $output_text .="";
    }
    return $output_text;
}

/*************************power point files*****************************/
function pptx_to_text($input_file){
    $Zip_handle = new ZipArchive;
    $output_text = "";
    if(true === $Zip_handle->open($input_file)){
        $slide_number = 1; //loop through slide files
        while(($xml_index = $Zip_handle->locateName("ppt/slides/slide".$slide_number.".xml")) !== false){
            $xml_datas = $Zip_handle->getFromIndex($xml_index);
            $xml_handle = DOMDocument::loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            $output_text .= strip_tags($xml_handle->saveXML());
            $slide_number++;
        }
        if($slide_number == 1){
            $output_text .="";
        }
        $Zip_handle->close();
    }else{
    $output_text .="";
    }
    return $output_text;
}


    public function convertToText() {

        if(isset($this->filename) && !file_exists($this->filename)) {
            return "File Not exists";
        }

        $fileArray = pathinfo($this->filename);
        $file_ext  = $fileArray['extension'];
        if($file_ext == "doc" || $file_ext == "docx" || $file_ext == "xlsx" || $file_ext == "pptx")
        {
            if($file_ext == "doc") {
                return $this->read_doc();
            } elseif($file_ext == "docx") {
                return $this->read_docx();
            } elseif($file_ext == "xlsx") {
                return $this->xlsx_to_text();
            }elseif($file_ext == "pptx") {
                return $this->pptx_to_text();
            }
        } else {
            return "Invalid File Type";
        }
    }

}

Document_file_format I file Doc sono blob binari. Possono essere letti usando fopen . Mentre i file .docx sono solo file Zip e file xml file xml in un contenitore zipfile (fonte wikipedia) puoi leggili usando Zip_open .

Uso di sopra della classe

$docObj = new DocxConversion("test.doc");
//$docObj = new DocxConversion("test.docx");
//$docObj = new DocxConversion("test.xlsx");
//$docObj = new DocxConversion("test.pptx");
echo $docText= $docObj->convertToText();
62
M Khalid Junaid

Dal file DOC

$filename = 'ypue file';         
    if ( file_exists($filename) ) {        

    if ( ($fh = fopen($filename, 'r')) !== false ) {

    $headers = fread($fh, 0xA00);

    $n1 = ( ord($headers[0x21C]) - 1 );

    $n2 = ( ( ord($headers[0x21D]) - 8 ) * 256 );

    $n3 = ( ( ord($headers[0x21E]) * 256 ) * 256 );

    $n4 = ( ( ( ord($headers[0x21F]) * 256 ) * 256 ) * 256 );


    $textLength = ($n1 + $n2 + $n3 + $n4);

    $extracted_plaintext = fread($fh, $textLength);

    echo nl2br($extracted_plaintext);

    print_r(extract_emails_from($extracted_plaintext));    

         }

        }

    function extract_emails_from($string) {
             preg_match_all("/[\._a-zA-Z0-9-][email protected][\._a-zA-Z0-9-]+/i", $string, $matches);
             return $matches[0];
    }

Da DOCX:

    /*Name of the document file*/
    $document = 'your file';

    /**Function to extract text*/
    function extracttext($filename) {
        //Check for extension
        $ext = end(explode('.', $filename));

    //if its docx file
    if($ext == 'docx')
    $dataFile = "Word/document.xml";
    //else it must be odt file
    else
    $dataFile = "content.xml";     

    //Create a new Zip archive object
    $Zip = new ZipArchive;

    // Open the archive file
    if (true === $Zip->open($filename)) {
        // If successful, search for the data file in the archive
        if (($index = $Zip->locateName($dataFile)) !== false) {
            // Index found! Now read it to a string
            $text = $Zip->getFromIndex($index);
            // Load XML from a string
            // Ignore errors and warnings
            $xml = DOMDocument::loadXML($text, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            // Remove XML formatting tags and return the text
            return strip_tags($xml->saveXML());
        }
        //Close the archive file
        $Zip->close();
    }

    // In case of failure return a message
    return "File not found";
}

echo extracttext($document);
9
Jumper Pot

// Per DOCX. Se si desidera conservare spazi bianchi, occuparsi anche delle tabelle tr e tc, utilizzare i codici seguenti: Modificarlo secondo i propri gusti. Perché scarica il file da remoto o locale 

//=========DOCX===========
function extractDocxText($url,$file_name){
        $docx = get_url($url);
        file_put_contents("tempf.docx",$docx);
        $xml_filename = "Word/document.xml"; //content file name
        $Zip_handle = new ZipArchive;
        $output_text = "";
        if(true === $Zip_handle->open("tempf.docx")){
            if(($xml_index = $Zip_handle->locateName($xml_filename)) !== false){
                $xml_datas = $Zip_handle->getFromIndex($xml_index);
                //file_put_contents($input_file.".xml",$xml_datas);
                $replace_newlines = preg_replace('/<w:p w[0-9-Za-z]+:[a-zA-Z0-9]+="[a-zA-z"0-9 :="]+">/',"\n\r",$xml_datas);
                $replace_tableRows = preg_replace('/<w:tr>/',"\n\r",$replace_newlines);
                $replace_tab = preg_replace('/<w:tab\/>/',"\t",$replace_tableRows);
                $replace_paragraphs = preg_replace('/<\/w:p>/',"\n\r",$replace_tab);
                $replace_other_Tags = strip_tags($replace_paragraphs);          
                $output_text = $replace_other_Tags;
            }else{
                $output_text .="";
            }
            $Zip_handle->close();
        }else{
        $output_text .=" ";
        }
        chmod("tempf.docx", 0777);  unlink(realpath("tempf.docx"));
        //save to file or echo content
        file_put_contents($file_name,$output_text);
        echo $output_text;
    }

//========PDF===========
//Requires installation in your Linux server
//Sudo su
//apt-get install xpdf
function extractPdfText($url,$PDF_fullpath_or_Filename){
    $pdf = get_url($url);
    file_put_contents ("temppdf.txt", $pdf);
    $content = pdf2text("temppdf.txt");
    chmod("temppdf.txt", 0777); unlink(realpath("temppdf.txt"));
    echo $content;
    file_put_contents($PDF_fullpath_or_Filename,$content);
    }



//========DOC==========
function extractDocText($url,$file_name){
    $doc = get_url($url);
    file_put_contents ("tempf.txt", $doc);

    $fileHandle = fopen("tempf.txt", "r");
    $line = @fread($fileHandle, filesize("tempf.txt"));
    $lines = explode(chr(0x0D),$line);
    $outtext = "";
    foreach($lines as $thisline){
        $pos = strpos($thisline, chr(0x00));
        if (($pos !== FALSE)||(strlen($thisline)==0))
        {} else {$outtext .= $thisline."\n\r";}
        }
    $content = preg_replace('/[a-zA-Z0-9\s\,\.\-\n\r\[email protected]\/\_\(\)]/','  ',$outtext);

    //chmod("tempf.txt", 0777); unlink(realpath("tempf.txt"));
    echo $content;
    file_put_contents($file_name,$content);
    }


//========XLSX==========
function extractXlsxText($url,$file_name){
    $xlsx = get_url($url);
    file_put_contents ("tempf.txt", $xlsx);
    $content = "";
    $dir = 'tempforxlsx';
    // Unzip
    $Zip = new ZipArchive();
    $Zip->open("tempf.txt");
    $Zip->extractTo($dir);
    // Open up shared strings & the first worksheet
    $strings = simplexml_load_file($dir . '/xl/sharedStrings.xml');
    $sheet   = simplexml_load_file($dir . '/xl/worksheets/sheet1.xml');
    // Parse the rows
    $xlrows = $sheet->sheetData->row;
    foreach ($xlrows as $xlrow) {
        $arr = array();

        // In each row, grab it's value
        foreach ($xlrow->c as $cell) {
            $v = (string) $cell->v;

            // If it has a "t" (type?) of "s" (string?), use the value to look up string value
            if (isset($cell['t']) && $cell['t'] == 's') {
                $s  = array();
                $si = $strings->si[(int) $v];

                // Register & alias the default namespace or you'll get empty results in the xpath query
                $si->registerXPathNamespace('n', 'http://schemas.openxmlformats.org/spreadsheetml/2006/main');
                // Cat together all of the 't' (text?) node values
                foreach($si->xpath('.//n:t') as $t) {
                    $content .= $t."  ";}   }
            }
        }
    echo $content;
    file_put_contents($file_name,$content);
    }


//========PPT========== 
function extractPptText($url,$file_name){
    $ppt = file_get_contents($url);
    file_put_contents ("tempf.ppt", $ppt);
    $fileHandle = fopen("tempf.ppt", "r");
    $line = @fread($fileHandle, filesize("tempf.ppt"));
    $lines = explode(chr(0x0f),$line);
    $outtext = '';

    foreach($lines as $thisline) {
        if (strpos($thisline, chr(0x00).chr(0x00).chr(0x00)) == 1) {
            $text_line = substr($thisline, 4);
            $end_pos   = strpos($text_line, chr(0x00));
            $text_line = substr($text_line, 0, $end_pos);
            $text_line = preg_replace('/[^a-zA-Z0-9\s\,\.\-\n\r\[email protected]\/\_\(\)]/',"  ",$text_line);
            $outtext = substr($text_line, 0, $end_pos)."\n".$outtext;
        }
    }
    //echo $outtext;
    file_put_contents($file_name,$outtext);
    }

//========PPTX==========
function extractPptxText($url,$file_name){
    $xls = get_url($url);
    file_put_contents ("tempf.txt", $xls);
    $Zip_handle = new ZipArchive;
    $output_text = ' ';
    if(true === $Zip_handle->open("tempf.txt")){
        $slide_number = 1; //loop through slide files
        while(($xml_index = $Zip_handle->locateName("ppt/slides/slide".$slide_number.".xml")) !== false){
            $xml_datas = $Zip_handle->getFromIndex($xml_index); // these four lines of codes
                                                                // below were
            $xml_handle = new DOMDocument ();                   // added by me in order
            $xml_handle->preserveWhiteSpace = true;             // to preserve space between
            $xml_handle->formatOutput = true;                   // each read data
            $xml_handle->loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            $output_text .= $xml_handle->saveXML();
            $slide_number++;
            }
        if($slide_number == 1){
            $output_text .= "";
        }
        $Zip_handle->close();
    }else{
    $output_text .= "";
    }
    echo $output_text;
    file_put_contents($file_name,$output_text);
    }

    /*

==========================================================================
=========================================================================
And below is get_url() function: Better than fie_get_contents();
*/

function get_url( $url,$timeout = 5 )
    {
        $url = str_replace( "&amp;", "&", urldecode(trim($url)) );
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1" );
        curl_setopt( $ch, CURLOPT_URL, $url );
        curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
        curl_setopt( $ch, CURLOPT_ENCODING, "" );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
        curl_setopt( $ch, CURLOPT_AUTOREFERER, true );
        curl_setopt( $ch, CURLOPT_SSL_VERIFYPEER, false );    # required for https urls
        curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $timeout );
        curl_setopt( $ch, CURLOPT_TIMEOUT, $timeout );
        curl_setopt( $ch, CURLOPT_MAXREDIRS, 10 );
        $content = curl_exec( $ch );
        //$response = curl_getinfo( $ch ); 
        curl_close ( $ch );
        return $content;
    }
2
FRanklinDavid

Per i documenti docx, suggerisco l'uso dello strumento docx2txt (disponibile almeno su Debian/Ubuntu):

docx2txt < your_file.docx

README spiega come integrarlo con vim. Aggiungi al tuo .vimrc:

" use docx2txt.pl to allow VIm to view the text content of a .docx file directly.
autocmd BufReadPre *.docx set ro
autocmd BufReadPost *.docx %!docx2txt

(spiega anche come integrare con emacs). 

Per gli hacker, questo strumento è scritto in Perl.

0
Jezz