Initial modifications for XML-based sheet readers to use XMLReader via streaming for large files in the listWorksheetNames() and listWorksheetInfo() methods... still needs some additional work for merged cells to work correctly, but gives a hell of a memory boost and executes more quickly - the same principles will be applied to the core data readers, but that still requires a lot more work. However, when completed, the effort should be worth it for both speed and memory usage, and the code should be a lot cleaner for the core data reader as well

This commit is contained in:
Mark Baker 2013-01-12 12:52:00 +00:00
parent 5de1067a16
commit 0ad3f67da6
5 changed files with 629 additions and 532 deletions

View File

@ -28,7 +28,7 @@
PHPExcel_Autoloader::Register();
// As we always try to run the autoloader before anything else, we can use it to do a few
// simple checks and initialisations
PHPExcel_Shared_ZipStreamWrapper::register();
//PHPExcel_Shared_ZipStreamWrapper::register();
// check mbstring.func_overload
if (ini_get('mbstring.func_overload') & 2) {
throw new Exception('Multibyte function overloading in PHP must be disabled for string functions (2).');

View File

@ -112,6 +112,50 @@ class PHPExcel_Reader_Excel2007 extends PHPExcel_Reader_Abstract implements PHPE
}
/**
* Reads names of the worksheets from a file, without parsing the whole file to a PHPExcel object
*
* @param string $pFilename
* @throws PHPExcel_Reader_Exception
*/
public function listWorksheetNames($pFilename)
{
// Check if file exists
if (!file_exists($pFilename)) {
throw new PHPExcel_Reader_Exception("Could not open " . $pFilename . " for reading! File does not exist.");
}
$worksheetNames = array();
$zip = new ZipArchive;
$zip->open($pFilename);
// The files we're looking at here are small enough that simpleXML is more efficient than XMLReader
$rels = simplexml_load_string(
$this->_getFromZipArchive($zip, "_rels/.rels")
); //~ http://schemas.openxmlformats.org/package/2006/relationships");
foreach ($rels->Relationship as $rel) {
switch ($rel["Type"]) {
case "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument":
$xmlWorkbook = simplexml_load_string(
$this->_getFromZipArchive($zip, "{$rel['Target']}")
); //~ http://schemas.openxmlformats.org/spreadsheetml/2006/main");
if ($xmlWorkbook->sheets) {
foreach ($xmlWorkbook->sheets->sheet as $eleSheet) {
// Check if sheet should be skipped
$worksheetNames[] = (string) $eleSheet["name"];
}
}
}
}
$zip->close();
return $worksheetNames;
}
/**
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns)
*
@ -148,32 +192,35 @@ class PHPExcel_Reader_Excel2007 extends PHPExcel_Reader_Abstract implements PHPE
if ($xmlWorkbook->sheets) {
$dir = dirname($rel["Target"]);
foreach ($xmlWorkbook->sheets->sheet as $eleSheet) {
$tmpInfo = array();
$tmpInfo['worksheetName'] = (string) $eleSheet["name"];
$tmpInfo['lastColumnLetter'] = 'A';
$tmpInfo['lastColumnIndex'] = 0;
$tmpInfo['totalRows'] = 0;
$tmpInfo['totalColumns'] = 0;
$tmpInfo = array(
'worksheetName' => (string) $eleSheet["name"],
'lastColumnLetter' => 'A',
'lastColumnIndex' => 0,
'totalRows' => 0,
'totalColumns' => 0,
);
$fileWorksheet = $worksheets[(string) self::array_item($eleSheet->attributes("http://schemas.openxmlformats.org/officeDocument/2006/relationships"), "id")];
$xmlSheet = simplexml_load_string($this->_getFromZipArchive($zip, "$dir/$fileWorksheet")); //~ http://schemas.openxmlformats.org/spreadsheetml/2006/main");
if ($xmlSheet && $xmlSheet->sheetData && $xmlSheet->sheetData->row) {
foreach ($xmlSheet->sheetData->row as $row) {
foreach ($row->c as $c) {
$r = (string) $c["r"];
$coordinates = PHPExcel_Cell::coordinateFromString($r);
$rowIndex = $coordinates[1];
$columnIndex = PHPExcel_Cell::columnIndexFromString($coordinates[0]) - 1;
$xml = new XMLReader();
$res = $xml->open('zip://'.realpath($pFilename).'#'."$dir/$fileWorksheet");
$xml->setParserProperty(2,true);
$tmpInfo['totalRows'] = max($tmpInfo['totalRows'], $rowIndex);
$tmpInfo['lastColumnIndex'] = max($tmpInfo['lastColumnIndex'], $columnIndex);
}
$currCells = 0;
while ($xml->read()) {
if ($xml->name == 'row' && $xml->nodeType == XMLReader::ELEMENT) {
$tmpInfo['totalRows']++;
$tmpInfo['totalColumns'] = max($tmpInfo['totalColumns'],$currCells);
$currCells = 0;
} elseif ($xml->name == 'c' && $xml->nodeType == XMLReader::ELEMENT) {
$currCells++;
}
}
$tmpInfo['totalColumns'] = max($tmpInfo['totalColumns'],$currCells);
$xml->close();
$tmpInfo['lastColumnIndex'] = $tmpInfo['totalColumns'] - 1;
$tmpInfo['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($tmpInfo['lastColumnIndex']);
$tmpInfo['totalColumns'] = $tmpInfo['lastColumnIndex'] + 1;
$worksheetInfo[] = $tmpInfo;
}
@ -282,45 +329,6 @@ class PHPExcel_Reader_Excel2007 extends PHPExcel_Reader_Abstract implements PHPE
}
/**
* Reads names of the worksheets from a file, without parsing the whole file to a PHPExcel object
*
* @param string $pFilename
* @throws PHPExcel_Reader_Exception
*/
public function listWorksheetNames($pFilename)
{
// Check if file exists
if (!file_exists($pFilename)) {
throw new PHPExcel_Reader_Exception("Could not open " . $pFilename . " for reading! File does not exist.");
}
$worksheetNames = array();
$zip = new ZipArchive;
$zip->open($pFilename);
$rels = simplexml_load_string($this->_getFromZipArchive($zip, "_rels/.rels")); //~ http://schemas.openxmlformats.org/package/2006/relationships");
foreach ($rels->Relationship as $rel) {
switch ($rel["Type"]) {
case "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument":
$xmlWorkbook = simplexml_load_string($this->_getFromZipArchive($zip, "{$rel['Target']}")); //~ http://schemas.openxmlformats.org/spreadsheetml/2006/main");
if ($xmlWorkbook->sheets) {
foreach ($xmlWorkbook->sheets->sheet as $eleSheet) {
// Check if sheet should be skipped
$worksheetNames[] = (string) $eleSheet["name"];
}
}
}
}
$zip->close();
return $worksheetNames;
}
/**
* Loads PHPExcel from file
*

View File

@ -102,6 +102,40 @@ class PHPExcel_Reader_Gnumeric extends PHPExcel_Reader_Abstract implements PHPEx
}
/**
* Reads names of the worksheets from a file, without parsing the whole file to a PHPExcel object
*
* @param string $pFilename
* @throws PHPExcel_Reader_Exception
*/
public function listWorksheetNames($pFilename)
{
// Check if file exists
if (!file_exists($pFilename)) {
throw new PHPExcel_Reader_Exception("Could not open " . $pFilename . " for reading! File does not exist.");
}
$xml = new XMLReader();
$xml->open(
'compress.zlib://'.realpath($pFilename)
);
$xml->setParserProperty(2,true);
$worksheetNames = array();
while ($xml->read()) {
if ($xml->name == 'gnm:SheetName' && $xml->nodeType == XMLReader::ELEMENT) {
$xml->read(); // Move onto the value node
$worksheetNames[] = (string) $xml->value;
} elseif ($xml->name == 'gnm:Sheets') {
// break out of the loop once we've got our sheet names rather than parse the entire file
break;
}
}
return $worksheetNames;
}
/**
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns)
*
@ -115,38 +149,41 @@ class PHPExcel_Reader_Gnumeric extends PHPExcel_Reader_Abstract implements PHPEx
throw new PHPExcel_Reader_Exception("Could not open " . $pFilename . " for reading! File does not exist.");
}
$gFileData = $this->_gzfileGetContents($pFilename);
$xml = simplexml_load_string($gFileData);
$namespacesMeta = $xml->getNamespaces(true);
$gnmXML = $xml->children($namespacesMeta['gnm']);
$xml = new XMLReader();
$xml->open(
'compress.zlib://'.realpath($pFilename)
);
$xml->setParserProperty(2,true);
$worksheetInfo = array();
while ($xml->read()) {
if ($xml->name == 'gnm:Sheet' && $xml->nodeType == XMLReader::ELEMENT) {
$tmpInfo = array(
'worksheetName' => '',
'lastColumnLetter' => 'A',
'lastColumnIndex' => 0,
'totalRows' => 0,
'totalColumns' => 0,
);
foreach ($gnmXML->Sheets->Sheet as $sheet) {
$tmpInfo = array();
$tmpInfo['worksheetName'] = (string) $sheet->Name;
$tmpInfo['lastColumnLetter'] = 'A';
$tmpInfo['lastColumnIndex'] = 0;
$tmpInfo['totalRows'] = 0;
$tmpInfo['totalColumns'] = 0;
foreach ($sheet->Cells->Cell as $cell) {
$cellAttributes = $cell->attributes();
$rowIndex = (int) $cellAttributes->Row + 1;
$columnIndex = (int) $cellAttributes->Col;
$tmpInfo['totalRows'] = max($tmpInfo['totalRows'], $rowIndex);
$tmpInfo['lastColumnIndex'] = max($tmpInfo['lastColumnIndex'], $columnIndex);
while ($xml->read()) {
if ($xml->name == 'gnm:Name' && $xml->nodeType == XMLReader::ELEMENT) {
$xml->read(); // Move onto the value node
$tmpInfo['worksheetName'] = (string) $xml->value;
} elseif ($xml->name == 'gnm:MaxCol' && $xml->nodeType == XMLReader::ELEMENT) {
$xml->read(); // Move onto the value node
$tmpInfo['lastColumnIndex'] = (int) $xml->value;
$tmpInfo['totalColumns'] = (int) $xml->value + 1;
} elseif ($xml->name == 'gnm:MaxRow' && $xml->nodeType == XMLReader::ELEMENT) {
$xml->read(); // Move onto the value node
$tmpInfo['totalRows'] = (int) $xml->value + 1;
break;
}
}
$tmpInfo['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($tmpInfo['lastColumnIndex']);
$tmpInfo['totalColumns'] = $tmpInfo['lastColumnIndex'] + 1;
$worksheetInfo[] = $tmpInfo;
}
}
return $worksheetInfo;
}
@ -182,36 +219,6 @@ class PHPExcel_Reader_Gnumeric extends PHPExcel_Reader_Abstract implements PHPEx
}
/**
* Reads names of the worksheets from a file, without parsing the whole file to a PHPExcel object
*
* @param string $pFilename
* @throws PHPExcel_Reader_Exception
*/
public function listWorksheetNames($pFilename)
{
// Check if file exists
if (!file_exists($pFilename)) {
throw new PHPExcel_Reader_Exception("Could not open " . $pFilename . " for reading! File does not exist.");
}
$gFileData = $this->_gzfileGetContents($pFilename);
$xml = simplexml_load_string($gFileData);
$namespacesMeta = $xml->getNamespaces(true);
$gnmXML = $xml->children($namespacesMeta['gnm']);
$worksheetNames = array();
foreach($gnmXML->Sheets->Sheet as $sheet) {
$worksheetNames[] = (string) $sheet->Name;
}
return $worksheetNames;
}
/**
* Loads PHPExcel from file into PHPExcel instance
*

View File

@ -124,21 +124,35 @@ class PHPExcel_Reader_OOCalc extends PHPExcel_Reader_Abstract implements PHPExce
throw new PHPExcel_Reader_Exception("Could not open " . $pFilename . " for reading! File does not exist.");
}
$zip = new ZipArchive;
if (!$zip->open($pFilename)) {
throw new PHPExcel_Reader_Exception("Could not open " . $pFilename . " for reading! Error opening file.");
}
$worksheetNames = array();
$zip = new ZipArchive;
if ($zip->open($pFilename) === true) {
$xml = new XMLReader();
$res = $xml->open('zip://'.realpath($pFilename).'#content.xml');
$xml->setParserProperty(2,true);
$xml = simplexml_load_string($zip->getFromName("content.xml"));
$namespacesContent = $xml->getNamespaces(true);
$workbook = $xml->children($namespacesContent['office']);
foreach($workbook->body->spreadsheet as $workbookData) {
$workbookData = $workbookData->children($namespacesContent['table']);
foreach($workbookData->table as $worksheetDataSet) {
$worksheetDataAttributes = $worksheetDataSet->attributes($namespacesContent['table']);
$worksheetNames[] = $worksheetDataAttributes['name'];
// Step into the first level of content of the XML
$xml->read();
while ($xml->read()) {
// Quickly jump through to the office:body node
while ($xml->name !== 'office:body') {
if ($xml->isEmptyElement)
$xml->read();
else
$xml->next();
}
// Now read each node until we find our first table:table node
while ($xml->read()) {
if ($xml->name == 'table:table' && $xml->nodeType == XMLReader::ELEMENT) {
// Loop through each table:table node reading the table:name attribute for each worksheet name
do {
$worksheetNames[] = $xml->getAttribute('table:name');
$xml->next();
} while ($xml->name == 'table:table' && $xml->nodeType == XMLReader::ELEMENT);
}
}
}
@ -147,6 +161,127 @@ class PHPExcel_Reader_OOCalc extends PHPExcel_Reader_Abstract implements PHPExce
}
/**
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns)
*
* @param string $pFilename
* @throws PHPExcel_Reader_Exception
*/
public function listWorksheetInfo($pFilename)
{
// Check if file exists
if (!file_exists($pFilename)) {
throw new PHPExcel_Reader_Exception("Could not open " . $pFilename . " for reading! File does not exist.");
}
$worksheetInfo = array();
$zip = new ZipArchive;
if (!$zip->open($pFilename)) {
throw new PHPExcel_Reader_Exception("Could not open " . $pFilename . " for reading! Error opening file.");
}
$xml = new XMLReader();
$res = $xml->open('zip://'.realpath($pFilename).'#content.xml');
$xml->setParserProperty(2,true);
// Step into the first level of content of the XML
$xml->read();
while ($xml->read()) {
// Quickly jump through to the office:body node
while ($xml->name !== 'office:body') {
if ($xml->isEmptyElement)
$xml->read();
else
$xml->next();
}
// Now read each node until we find our first table:table node
while ($xml->read()) {
if ($xml->name == 'table:table' && $xml->nodeType == XMLReader::ELEMENT) {
$worksheetNames[] = $xml->getAttribute('table:name');
$tmpInfo = array(
'worksheetName' => $xml->getAttribute('table:name'),
'lastColumnLetter' => 'A',
'lastColumnIndex' => 0,
'totalRows' => 0,
'totalColumns' => 0,
);
// Loop through each child node of the table:table element reading
$currCells = 0;
do {
$xml->read();
if ($xml->name == 'table:table-row' && $xml->nodeType == XMLReader::ELEMENT) {
$tmpInfo['totalRows']++;
$tmpInfo['totalColumns'] = max($tmpInfo['totalColumns'],$currCells);
$currCells = 0;
// Step into the row
$xml->read();
do {
if ($xml->name == 'table:table-cell' && $xml->nodeType == XMLReader::ELEMENT) {
if (!$xml->isEmptyElement) {
$currCells++;
$xml->next();
} else {
$xml->read();
}
} elseif ($xml->name == 'table:covered-table-cell' && $xml->nodeType == XMLReader::ELEMENT) {
$mergeSize = $xml->getAttribute('table:number-columns-repeated');
$currCells += $mergeSize;
$xml->read();
}
} while ($xml->name != 'table:table-row');
}
} while ($xml->name != 'table:table');
$tmpInfo['totalColumns'] = max($tmpInfo['totalColumns'],$currCells);
$tmpInfo['lastColumnIndex'] = $tmpInfo['totalColumns'] - 1;
$tmpInfo['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($tmpInfo['lastColumnIndex']);
$worksheetInfo[] = $tmpInfo;
}
}
// foreach($workbookData->table as $worksheetDataSet) {
// $worksheetData = $worksheetDataSet->children($namespacesContent['table']);
// $worksheetDataAttributes = $worksheetDataSet->attributes($namespacesContent['table']);
//
// $rowIndex = 0;
// foreach ($worksheetData as $key => $rowData) {
// switch ($key) {
// case 'table-row' :
// $rowDataTableAttributes = $rowData->attributes($namespacesContent['table']);
// $rowRepeats = (isset($rowDataTableAttributes['number-rows-repeated'])) ?
// $rowDataTableAttributes['number-rows-repeated'] : 1;
// $columnIndex = 0;
//
// foreach ($rowData as $key => $cellData) {
// $cellDataTableAttributes = $cellData->attributes($namespacesContent['table']);
// $colRepeats = (isset($cellDataTableAttributes['number-columns-repeated'])) ?
// $cellDataTableAttributes['number-columns-repeated'] : 1;
// $cellDataOfficeAttributes = $cellData->attributes($namespacesContent['office']);
// if (isset($cellDataOfficeAttributes['value-type'])) {
// $tmpInfo['lastColumnIndex'] = max($tmpInfo['lastColumnIndex'], $columnIndex + $colRepeats - 1);
// $tmpInfo['totalRows'] = max($tmpInfo['totalRows'], $rowIndex + $rowRepeats);
// }
// $columnIndex += $colRepeats;
// }
// $rowIndex += $rowRepeats;
// break;
// }
// }
//
// $tmpInfo['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($tmpInfo['lastColumnIndex']);
// $tmpInfo['totalColumns'] = $tmpInfo['lastColumnIndex'] + 1;
//
// }
// }
}
return $worksheetInfo;
}
/**
* Loads PHPExcel from file
*
@ -176,78 +311,6 @@ class PHPExcel_Reader_OOCalc extends PHPExcel_Reader_Abstract implements PHPExce
}
/**
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns)
*
* @param string $pFilename
* @throws PHPExcel_Reader_Exception
*/
public function listWorksheetInfo($pFilename)
{
// Check if file exists
if (!file_exists($pFilename)) {
throw new PHPExcel_Reader_Exception("Could not open " . $pFilename . " for reading! File does not exist.");
}
$worksheetInfo = array();
$zip = new ZipArchive;
if ($zip->open($pFilename) === true) {
$xml = simplexml_load_string($zip->getFromName("content.xml"));
$namespacesContent = $xml->getNamespaces(true);
$workbook = $xml->children($namespacesContent['office']);
foreach($workbook->body->spreadsheet as $workbookData) {
$workbookData = $workbookData->children($namespacesContent['table']);
foreach($workbookData->table as $worksheetDataSet) {
$worksheetData = $worksheetDataSet->children($namespacesContent['table']);
$worksheetDataAttributes = $worksheetDataSet->attributes($namespacesContent['table']);
$tmpInfo = array();
$tmpInfo['worksheetName'] = (string) $worksheetDataAttributes['name'];
$tmpInfo['lastColumnLetter'] = 'A';
$tmpInfo['lastColumnIndex'] = 0;
$tmpInfo['totalRows'] = 0;
$tmpInfo['totalColumns'] = 0;
$rowIndex = 0;
foreach ($worksheetData as $key => $rowData) {
switch ($key) {
case 'table-row' :
$rowDataTableAttributes = $rowData->attributes($namespacesContent['table']);
$rowRepeats = (isset($rowDataTableAttributes['number-rows-repeated'])) ?
$rowDataTableAttributes['number-rows-repeated'] : 1;
$columnIndex = 0;
foreach ($rowData as $key => $cellData) {
$cellDataTableAttributes = $cellData->attributes($namespacesContent['table']);
$colRepeats = (isset($cellDataTableAttributes['number-columns-repeated'])) ?
$cellDataTableAttributes['number-columns-repeated'] : 1;
$cellDataOfficeAttributes = $cellData->attributes($namespacesContent['office']);
if (isset($cellDataOfficeAttributes['value-type'])) {
$tmpInfo['lastColumnIndex'] = max($tmpInfo['lastColumnIndex'], $columnIndex + $colRepeats - 1);
$tmpInfo['totalRows'] = max($tmpInfo['totalRows'], $rowIndex + $rowRepeats);
}
$columnIndex += $colRepeats;
}
$rowIndex += $rowRepeats;
break;
}
}
$tmpInfo['lastColumnLetter'] = PHPExcel_Cell::stringFromColumnIndex($tmpInfo['lastColumnIndex']);
$tmpInfo['totalColumns'] = $tmpInfo['lastColumnIndex'] + 1;
$worksheetInfo[] = $tmpInfo;
}
}
}
return $worksheetInfo;
}
/**
* Loads PHPExcel from file into PHPExcel instance
*
@ -267,7 +330,10 @@ class PHPExcel_Reader_OOCalc extends PHPExcel_Reader_Abstract implements PHPExce
$GMT = new DateTimeZone('UTC');
$zip = new ZipArchive;
if ($zip->open($pFilename) === true) {
if (!$zip->open($pFilename)) {
throw new PHPExcel_Reader_Exception("Could not open " . $pFilename . " for reading! Error opening file.");
}
// echo '<h1>Meta Information</h1>';
$xml = simplexml_load_string($zip->getFromName("meta.xml"));
$namespacesMeta = $xml->getNamespaces(true);
@ -611,8 +677,6 @@ class PHPExcel_Reader_OOCalc extends PHPExcel_Reader_Abstract implements PHPExce
}
}
}
// Return
return $objPHPExcel;
}

View File

@ -100,6 +100,24 @@ class PHPExcel_Shared_ZipStreamWrapper {
return true;
}
/**
* Implements support for fstat().
*
* @return boolean
*/
public function statName() {
return $this->_fileNameInArchive;
}
/**
* Implements support for fstat().
*
* @return boolean
*/
public function url_stat() {
return $this->statName( $this->_fileNameInArchive );
}
/**
* Implements support for fstat().
*