diff --git a/Classes/PHPExcel/Reader/HTML.php b/Classes/PHPExcel/Reader/HTML.php index 6adcc85..df389e7 100644 --- a/Classes/PHPExcel/Reader/HTML.php +++ b/Classes/PHPExcel/Reader/HTML.php @@ -1,4 +1,5 @@ array( 'font' => array( 'bold' => true, - 'size' => 24, - ), - ), // Bold, 24pt - 'h2' => array( 'font' => array( 'bold' => true, - 'size' => 18, - ), - ), // Bold, 18pt - 'h3' => array( 'font' => array( 'bold' => true, - 'size' => 13.5, - ), - ), // Bold, 13.5pt - 'h4' => array( 'font' => array( 'bold' => true, - 'size' => 12, - ), - ), // Bold, 12pt - 'h5' => array( 'font' => array( 'bold' => true, - 'size' => 10, - ), - ), // Bold, 10pt - 'h6' => array( 'font' => array( 'bold' => true, - 'size' => 7.5, - ), - ), // Bold, 7.5pt - 'a' => array( 'font' => array( 'underline' => true, - 'color' => array( 'argb' => PHPExcel_Style_Color::COLOR_BLUE, - ), - ), - ), // Blue underlined - 'hr' => array( 'borders' => array( 'bottom' => array( 'style' => PHPExcel_Style_Border::BORDER_THIN, - 'color' => array( PHPExcel_Style_Color::COLOR_BLACK, - ), - ), - ), - ), // Bottom border - ); + /** + * Sheet index to read + * + * @var int + */ + protected $_sheetIndex = 0; + /** + * Formats + * + * @var array + */ + protected $_formats = array( + 'h1' => array('font' => array('bold' => true, + 'size' => 24, + ), + ), // Bold, 24pt + 'h2' => array('font' => array('bold' => true, + 'size' => 18, + ), + ), // Bold, 18pt + 'h3' => array('font' => array('bold' => true, + 'size' => 13.5, + ), + ), // Bold, 13.5pt + 'h4' => array('font' => array('bold' => true, + 'size' => 12, + ), + ), // Bold, 12pt + 'h5' => array('font' => array('bold' => true, + 'size' => 10, + ), + ), // Bold, 10pt + 'h6' => array('font' => array('bold' => true, + 'size' => 7.5, + ), + ), // Bold, 7.5pt + 'a' => array('font' => array('underline' => true, + 'color' => array('argb' => PHPExcel_Style_Color::COLOR_BLUE, + ), + ), + ), // Blue underlined + 'hr' => array('borders' => array('bottom' => array('style' => PHPExcel_Style_Border::BORDER_THIN, + 'color' => array(\PHPExcel_Style_Color::COLOR_BLACK, + ), + ), + ), + ), // Bottom border + ); - /** - * Create a new PHPExcel_Reader_HTML - */ - public function __construct() { - $this->_readFilter = new PHPExcel_Reader_DefaultReadFilter(); - } + protected $rowspan = array(); - /** - * Validate that the current file is an HTML file - * - * @return boolean - */ - protected function _isValidFormat() - { - // Reading 2048 bytes should be enough to validate that the format is HTML - $data = fread($this->_fileHandle, 2048); - if ((strpos($data, '<') !== FALSE) && - (strlen($data) !== strlen(strip_tags($data)))) { - return TRUE; - } + /** + * Create a new PHPExcel_Reader_HTML + */ + public function __construct() + { + $this->_readFilter = new PHPExcel_Reader_DefaultReadFilter(); + } - return FALSE; - } + /** + * Validate that the current file is an HTML file + * + * @return boolean + */ + protected function _isValidFormat() + { + // Reading 2048 bytes should be enough to validate that the format is HTML + $data = fread($this->_fileHandle, 2048); + if ((strpos($data, '<') !== FALSE) && + (strlen($data) !== strlen(strip_tags($data)))) { + return TRUE; + } - /** - * Loads PHPExcel from file - * - * @param string $pFilename - * @return PHPExcel - * @throws PHPExcel_Reader_Exception - */ - public function load($pFilename) - { - // Create new PHPExcel - $objPHPExcel = new PHPExcel(); + return FALSE; + } - // Load into this instance - return $this->loadIntoExisting($pFilename, $objPHPExcel); - } + /** + * Loads PHPExcel from file + * + * @param string $pFilename + * @return PHPExcel + * @throws PHPExcel_Reader_Exception + */ + public function load($pFilename) + { + // Create new PHPExcel + $objPHPExcel = new PHPExcel(); - /** - * Set input encoding - * - * @param string $pValue Input encoding - */ - public function setInputEncoding($pValue = 'ANSI') - { - $this->_inputEncoding = $pValue; - return $this; - } + // Load into this instance + return $this->loadIntoExisting($pFilename, $objPHPExcel); + } - /** - * Get input encoding - * - * @return string - */ - public function getInputEncoding() - { - return $this->_inputEncoding; - } + /** + * Set input encoding + * + * @param string $pValue Input encoding + */ + public function setInputEncoding($pValue = 'ANSI') + { + $this->_inputEncoding = $pValue; - // Data Array used for testing only, should write to PHPExcel object on completion of tests - private $_dataArray = array(); + return $this; + } - private $_tableLevel = 0; - private $_nestedColumn = array('A'); + /** + * Get input encoding + * + * @return string + */ + public function getInputEncoding() + { + return $this->_inputEncoding; + } - private function _setTableStartColumn($column) { - if ($this->_tableLevel == 0) - $column = 'A'; - ++$this->_tableLevel; - $this->_nestedColumn[$this->_tableLevel] = $column; + // Data Array used for testing only, should write to PHPExcel object on completion of tests + protected $_dataArray = array(); + protected $_tableLevel = 0; + protected $_nestedColumn = array('A'); - return $this->_nestedColumn[$this->_tableLevel]; - } + protected function _setTableStartColumn($column) + { + if ($this->_tableLevel == 0) + $column = 'A'; + ++$this->_tableLevel; + $this->_nestedColumn[$this->_tableLevel] = $column; - private function _getTableStartColumn() { - return $this->_nestedColumn[$this->_tableLevel]; - } + return $this->_nestedColumn[$this->_tableLevel]; + } - private function _releaseTableStartColumn() { - --$this->_tableLevel; - return array_pop($this->_nestedColumn); - } + protected function _getTableStartColumn() + { + return $this->_nestedColumn[$this->_tableLevel]; + } - private function _flushCell($sheet,$column,$row,&$cellContent) { - if (is_string($cellContent)) { - // Simple String content - if (trim($cellContent) > '') { - // Only actually write it if there's content in the string + protected function _releaseTableStartColumn() + { + --$this->_tableLevel; + + return array_pop($this->_nestedColumn); + } + + protected function _flushCell($sheet, $column, $row, &$cellContent) + { + if (is_string($cellContent)) { + // Simple String content + if (trim($cellContent) > '') { + // Only actually write it if there's content in the string // echo 'FLUSH CELL: ' , $column , $row , ' => ' , $cellContent , '
'; - // Write to worksheet to be done here... - // ... we return the cell so we can mess about with styles more easily - $cell = $sheet->setCellValue($column.$row,$cellContent,true); - $this->_dataArray[$row][$column] = $cellContent; - } - } else { - // We have a Rich Text run - // TODO - $this->_dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent; - } - $cellContent = (string) ''; - } + // Write to worksheet to be done here... + // ... we return the cell so we can mess about with styles more easily + $sheet->setCellValue($column . $row, $cellContent, true); + $this->_dataArray[$row][$column] = $cellContent; + } + } else { + // We have a Rich Text run + // TODO + $this->_dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent; + } + $cellContent = (string) ''; + } - private function _processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent){ - foreach($element->childNodes as $child){ - if ($child instanceof DOMText) { - $domText = preg_replace('/\s+/',' ',trim($child->nodeValue)); - if (is_string($cellContent)) { - // simply append the text if the cell content is a plain text string - $cellContent .= $domText; - } else { - // but if we have a rich text run instead, we need to append it correctly - // TODO - } - } elseif($child instanceof DOMElement) { + protected function _processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent, $format = null) + { + foreach ($element->childNodes as $child) { + if ($child instanceof DOMText) { + $domText = preg_replace('/\s+/', ' ', trim($child->nodeValue)); + if (is_string($cellContent)) { + // simply append the text if the cell content is a plain text string + $cellContent .= $domText; + } else { + // but if we have a rich text run instead, we need to append it correctly + // TODO + } + } elseif ($child instanceof DOMElement) { // echo 'DOM ELEMENT: ' , strtoupper($child->nodeName) , '
'; - $attributeArray = array(); - foreach($child->attributes as $attribute) { + $attributeArray = array(); + foreach ($child->attributes as $attribute) { // echo 'ATTRIBUTE: ' , $attribute->name , ' => ' , $attribute->value , '
'; - $attributeArray[$attribute->name] = $attribute->value; - } + $attributeArray[$attribute->name] = $attribute->value; + } - switch($child->nodeName) { - case 'meta' : - foreach($attributeArray as $attributeName => $attributeValue) { - switch($attributeName) { - case 'content': - // TODO - // Extract character set, so we can convert to UTF-8 if required - break; - } - } - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); - break; - case 'title' : - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); - $sheet->setTitle($cellContent); - $cellContent = ''; - break; - case 'span' : - case 'div' : - case 'font' : - case 'i' : - case 'em' : - case 'strong': - case 'b' : + switch ($child->nodeName) { + case 'meta' : + foreach ($attributeArray as $attributeName => $attributeValue) { + switch ($attributeName) { + case 'content': + // TODO + // Extract character set, so we can convert to UTF-8 if required + break; + } + } + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); + break; + case 'title' : + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); + $sheet->setTitle($cellContent); + $cellContent = ''; + break; + case 'span' : + case 'div' : + case 'font' : + case 'i' : + case 'em' : + case 'strong': + case 'b' : // echo 'STYLING, SPAN OR DIV
'; - if ($cellContent > '') - $cellContent .= ' '; - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); - if ($cellContent > '') - $cellContent .= ' '; + if ($cellContent > '') + $cellContent .= ' '; + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); + if ($cellContent > '') + $cellContent .= ' '; // echo 'END OF STYLING, SPAN OR DIV
'; - break; - case 'hr' : - $this->_flushCell($sheet,$column,$row,$cellContent); - ++$row; - if (isset($this->_formats[$child->nodeName])) { - $sheet->getStyle($column.$row)->applyFromArray($this->_formats[$child->nodeName]); - } else { - $cellContent = '----------'; - $this->_flushCell($sheet,$column,$row,$cellContent); - } - ++$row; - case 'br' : - if ($this->_tableLevel > 0) { - // If we're inside a table, replace with a \n - $cellContent .= "\n"; - } else { - // Otherwise flush our existing content and move the row cursor on - $this->_flushCell($sheet,$column,$row,$cellContent); - ++$row; - } + break; + case 'hr' : + $this->_flushCell($sheet, $column, $row, $cellContent); + ++$row; + if (isset($this->_formats[$child->nodeName])) { + $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]); + } else { + $cellContent = '----------'; + $this->_flushCell($sheet, $column, $row, $cellContent); + } + ++$row; + case 'br' : + if ($this->_tableLevel > 0) { + // If we're inside a table, replace with a \n + $cellContent .= "\n"; + } else { + // Otherwise flush our existing content and move the row cursor on + $this->_flushCell($sheet, $column, $row, $cellContent); + ++$row; + } // echo 'HARD LINE BREAK: ' , '
'; - break; - case 'a' : + break; + case 'a' : // echo 'START OF HYPERLINK: ' , '
'; - foreach($attributeArray as $attributeName => $attributeValue) { - switch($attributeName) { - case 'href': + foreach ($attributeArray as $attributeName => $attributeValue) { + switch ($attributeName) { + case 'href': // echo 'Link to ' , $attributeValue , '
'; - $sheet->getCell($column.$row)->getHyperlink()->setUrl($attributeValue); - if (isset($this->_formats[$child->nodeName])) { - $sheet->getStyle($column.$row)->applyFromArray($this->_formats[$child->nodeName]); - } - break; - } - } - $cellContent .= ' '; - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); + $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue); + if (isset($this->_formats[$child->nodeName])) { + $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]); + } + break; + } + } + $cellContent .= ' '; + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); // echo 'END OF HYPERLINK:' , '
'; - break; - case 'h1' : - case 'h2' : - case 'h3' : - case 'h4' : - case 'h5' : - case 'h6' : - case 'ol' : - case 'ul' : - case 'p' : - if ($this->_tableLevel > 0) { - // If we're inside a table, replace with a \n - $cellContent .= "\n"; + break; + case 'h1' : + case 'h2' : + case 'h3' : + case 'h4' : + case 'h5' : + case 'h6' : + case 'ol' : + case 'ul' : + case 'p' : + if ($this->_tableLevel > 0) { + // If we're inside a table, replace with a \n + $cellContent .= "\n"; // echo 'LIST ENTRY: ' , '
'; - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); // echo 'END OF LIST ENTRY:' , '
'; - } else { - if ($cellContent > '') { - $this->_flushCell($sheet,$column,$row,$cellContent); - $row += 2; - } + } else { + if ($cellContent > '') { + $this->_flushCell($sheet, $column, $row, $cellContent); + $row++; + } // echo 'START OF PARAGRAPH: ' , '
'; - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); // echo 'END OF PARAGRAPH:' , '
'; - $this->_flushCell($sheet,$column,$row,$cellContent); + $this->_flushCell($sheet, $column, $row, $cellContent); - if (isset($this->_formats[$child->nodeName])) { - $sheet->getStyle($column.$row)->applyFromArray($this->_formats[$child->nodeName]); - } + if (isset($this->_formats[$child->nodeName])) { + $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]); + } - $row += 2; - $column = 'A'; - } - break; - case 'li' : - if ($this->_tableLevel > 0) { - // If we're inside a table, replace with a \n - $cellContent .= "\n"; + $row++; + $column = 'A'; + } + break; + case 'li' : + if ($this->_tableLevel > 0) { + // If we're inside a table, replace with a \n + $cellContent .= "\n"; // echo 'LIST ENTRY: ' , '
'; - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); // echo 'END OF LIST ENTRY:' , '
'; - } else { - if ($cellContent > '') { - $this->_flushCell($sheet,$column,$row,$cellContent); - } - ++$row; + } else { + if ($cellContent > '') { + $this->_flushCell($sheet, $column, $row, $cellContent); + } + ++$row; // echo 'LIST ENTRY: ' , '
'; - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); // echo 'END OF LIST ENTRY:' , '
'; - $this->_flushCell($sheet,$column,$row,$cellContent); - $column = 'A'; - } - break; - case 'table' : - $this->_flushCell($sheet,$column,$row,$cellContent); - $column = $this->_setTableStartColumn($column); + $this->_flushCell($sheet, $column, $row, $cellContent); + $column = 'A'; + } + break; + case 'table' : + $this->_flushCell($sheet, $column, $row, $cellContent); + $column = $this->_setTableStartColumn($column); // echo 'START OF TABLE LEVEL ' , $this->_tableLevel , '
'; - if ($this->_tableLevel > 1) - --$row; - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); + if ($this->_tableLevel > 1) + --$row; + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); // echo 'END OF TABLE LEVEL ' , $this->_tableLevel , '
'; - $column = $this->_releaseTableStartColumn(); - if ($this->_tableLevel > 1) { - ++$column; - } else { - ++$row; - } - break; - case 'thead' : - case 'tbody' : - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); - break; - case 'tr' : - ++$row; - $column = $this->_getTableStartColumn(); - $cellContent = ''; + $column = $this->_releaseTableStartColumn(); + if ($this->_tableLevel > 1) { + ++$column; + } else { + ++$row; + } + break; + case 'thead' : + case 'tbody' : + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); + break; + case 'tr' : + $column = $this->_getTableStartColumn(); + $cellContent = ''; // echo 'START OF TABLE ' , $this->_tableLevel , ' ROW
'; - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); + ++$row; // echo 'END OF TABLE ' , $this->_tableLevel , ' ROW
'; - break; - case 'th' : - case 'td' : + break; + case 'th' : + case 'td' : // echo 'START OF TABLE ' , $this->_tableLevel , ' CELL
'; - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); // echo 'END OF TABLE ' , $this->_tableLevel , ' CELL
'; - $this->_flushCell($sheet,$column,$row,$cellContent); - ++$column; - break; - case 'body' : - $row = 1; - $column = 'A'; - $content = ''; - $this->_tableLevel = 0; - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); - break; - default: - $this->_processDomElement($child,$sheet,$row,$column,$cellContent); - } - } - } - } - /** - * Loads PHPExcel from file into PHPExcel instance - * - * @param string $pFilename - * @param PHPExcel $objPHPExcel - * @return PHPExcel - * @throws PHPExcel_Reader_Exception - */ - public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel) - { - // Open file to validate - $this->_openFile($pFilename); - if (!$this->_isValidFormat()) { - fclose ($this->_fileHandle); - throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file."); - } - // Close after validating - fclose ($this->_fileHandle); + while (isset($this->rowspan[$column . $row])) { + ++$column; + } - // Create new PHPExcel - while ($objPHPExcel->getSheetCount() <= $this->_sheetIndex) { - $objPHPExcel->createSheet(); - } - $objPHPExcel->setActiveSheetIndex( $this->_sheetIndex ); + $this->_flushCell($sheet, $column, $row, $cellContent); - // Create a new DOM object - $dom = new DOMDocument; - // Reload the HTML file into the DOM object - if ((version_compare(PHP_VERSION, '5.4.0') >= 0) && defined(LIBXML_DTDLOAD)) { - $loaded = $dom->loadHTMLFile($pFilename, PHPExcel_Settings::getLibXmlLoaderOptions()); - } else { - $loaded = $dom->loadHTMLFile($pFilename); + if (isset($attributeArray['style']) && !empty($attributeArray['style'])) { + $styleAry = $this->getPhpExcelStyleArray($attributeArray['style']); + + if (!empty($styleAry)) { + $sheet->getStyle($column . $row)->applyFromArray($styleAry); + } + } + + //create merging rowspan + if (isset($attributeArray['rowspan']) && isset($attributeArray['colspan'])) { + $columnTo = $column; + for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) { + ++$columnTo; + } + $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1); + foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) { + $this->rowspan[$value] = true; + } + $sheet->mergeCells($range); + } elseif (isset($attributeArray['rowspan'])) { + $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1); + foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) { + $this->rowspan[$value] = true; + } + $sheet->mergeCells($range); + } elseif (isset($attributeArray['colspan'])) { + $columnTo = $column; + for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) { + ++$columnTo; + } + $sheet->mergeCells($column . $row . ':' . $columnTo . $row); + $column = $columnTo; + } + ++$column; + break; + case 'body' : + $row = 1; + $column = 'A'; + $content = ''; + $this->_tableLevel = 0; + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); + break; + default: + $this->_processDomElement($child, $sheet, $row, $column, $cellContent); + } + } } - if ($loaded === FALSE) { - throw new PHPExcel_Reader_Exception('Failed to load '. $pFilename. ' as a DOM Document'); - } + } - // Discard white space - $dom->preserveWhiteSpace = false; + /** + * Loads PHPExcel from file into PHPExcel instance + * + * @param string $pFilename + * @param PHPExcel $objPHPExcel + * @return PHPExcel + * @throws PHPExcel_Reader_Exception + */ + public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel) + { + // Open file to validate + $this->_openFile($pFilename); + if (!$this->_isValidFormat()) { + fclose($this->_fileHandle); + throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file."); + } + // Close after validating + fclose($this->_fileHandle); + // Create new PHPExcel + while ($objPHPExcel->getSheetCount() <= $this->_sheetIndex) { + $objPHPExcel->createSheet(); + } + $objPHPExcel->setActiveSheetIndex($this->_sheetIndex); - $row = 0; - $column = 'A'; - $content = ''; - $this->_processDomElement($dom,$objPHPExcel->getActiveSheet(),$row,$column,$content); + // Create a new DOM object + $dom = new domDocument; + // Reload the HTML file into the DOM object + $loaded = $dom->loadHTMLFile($pFilename); + if ($loaded === FALSE) { + throw new PHPExcel_Reader_Exception('Failed to load ', $pFilename, ' as a DOM Document'); + } + + // Discard white space + $dom->preserveWhiteSpace = false; + + $row = 0; + $column = 'A'; + $content = ''; + $this->_processDomElement($dom, $objPHPExcel->getActiveSheet(), $row, $column, $content); // Return - return $objPHPExcel; - } + return $objPHPExcel; + } - /** - * Get sheet index - * - * @return int - */ - public function getSheetIndex() { - return $this->_sheetIndex; - } + /** + * Get sheet index + * + * @return int + */ + public function getSheetIndex() + { + return $this->_sheetIndex; + } - /** - * Set sheet index - * - * @param int $pValue Sheet index - * @return PHPExcel_Reader_HTML - */ - public function setSheetIndex($pValue = 0) { - $this->_sheetIndex = $pValue; - return $this; - } + /** + * Set sheet index + * + * @param int $pValue Sheet index + * @return PHPExcel_Reader_HTML + */ + public function setSheetIndex($pValue = 0) + { + $this->_sheetIndex = $pValue; + + return $this; + } } + diff --git a/changelog.txt b/changelog.txt index 0b8845f..587059b 100644 --- a/changelog.txt +++ b/changelog.txt @@ -49,6 +49,7 @@ Planned for v1.8.1 - Feature: (CQD) Work Item GH-389 - Additional Mac CJK codepage definitions - Feature: (bolovincev) Work Item GH-269 - Update Worksheet.php getStyleByColumnAndRow() to allow a range of cells rather than just a single cell - Feature: (MBaker) - New methods added for testing cell status within merge groups +- Feature: (cifren/MBaker) Work Item GH-205 - Handling merge cells in HTML Reader 2014-03-02 (v1.8.0):