ILIAS  release_5-2 Revision v5.2.25-18-g3f80b82851
HTML.php
Go to the documentation of this file.
1 <?php
2 
// Stand-alone bootstrap: when PHPEXCEL_ROOT has not been defined yet, define it
// relative to this file and pull in the PHPExcel autoloader.
if (defined('PHPEXCEL_ROOT') === false) {
    define('PHPEXCEL_ROOT', __DIR__ . '/../../');
    require(PHPEXCEL_ROOT . 'PHPExcel/Autoloader.php');
}
36 
45 {
46 
52  protected $_inputEncoding = 'ANSI';
53 
59  protected $_sheetIndex = 0;
60 
66  protected $_formats = array(
67  'h1' => array('font' => array('bold' => true,
68  'size' => 24,
69  ),
70  ), // Bold, 24pt
71  'h2' => array('font' => array('bold' => true,
72  'size' => 18,
73  ),
74  ), // Bold, 18pt
75  'h3' => array('font' => array('bold' => true,
76  'size' => 13.5,
77  ),
78  ), // Bold, 13.5pt
79  'h4' => array('font' => array('bold' => true,
80  'size' => 12,
81  ),
82  ), // Bold, 12pt
83  'h5' => array('font' => array('bold' => true,
84  'size' => 10,
85  ),
86  ), // Bold, 10pt
87  'h6' => array('font' => array('bold' => true,
88  'size' => 7.5,
89  ),
90  ), // Bold, 7.5pt
91  'a' => array('font' => array('underline' => true,
92  'color' => array('argb' => PHPExcel_Style_Color::COLOR_BLUE,
93  ),
94  ),
95  ), // Blue underlined
96  'hr' => array('borders' => array('bottom' => array('style' => PHPExcel_Style_Border::BORDER_THIN,
98  ),
99  ),
100  ),
101  ), // Bottom border
102  );
103 
104  protected $rowspan = array();
105 
/**
 * Create a new reader and install a default read filter on it.
 */
public function __construct()
{
    $defaultFilter = new PHPExcel_Reader_DefaultReadFilter();
    $this->_readFilter = $defaultFilter;
}
113 
/**
 * Check whether the currently opened file looks like HTML.
 *
 * Reads up to 2048 bytes from $this->_fileHandle and reports TRUE when that
 * sample contains a '<' character and strip_tags() makes it shorter.
 *
 * @return boolean
 */
protected function _isValidFormat()
{
    // A 2048-byte sample is treated as enough to recognise markup
    $sample = fread($this->_fileHandle, 2048);

    $hasAngleBracket = strpos($sample, '<') !== FALSE;

    return $hasAngleBracket && (strlen($sample) !== strlen(strip_tags($sample)));
}
130 
/**
 * Load the given file into a freshly created PHPExcel workbook.
 *
 * @param  string $pFilename
 * @return PHPExcel whatever loadIntoExisting() returns for the new workbook
 */
public function load($pFilename)
{
    $workbook = new PHPExcel();

    return $this->loadIntoExisting($pFilename, $workbook);
}
146 
/**
 * Store the input encoding on the reader.
 *
 * @param  string $pValue Encoding name; defaults to 'ANSI'
 * @return $this  Returned to allow call chaining
 */
public function setInputEncoding($pValue = 'ANSI')
{
    $encoding = $pValue;
    $this->_inputEncoding = $encoding;

    return $this;
}
158 
/**
 * Return the input encoding currently stored on the reader.
 *
 * @return string
 */
public function getInputEncoding()
{
    $encoding = $this->_inputEncoding;

    return $encoding;
}
168 
169  // Data Array used for testing only, should write to PHPExcel object on completion of tests
170  protected $_dataArray = array();
171  protected $_tableLevel = 0;
172  protected $_nestedColumn = array('A');
173 
/**
 * Enter a new table nesting level and record the column at which it starts.
 *
 * $this->_nestedColumn is indexed by $this->_tableLevel, so the level has to
 * advance before the column is stored; otherwise the slot of the enclosing
 * level is overwritten and $this->_tableLevel never leaves 0.
 *
 * @param  string $column Column the cursor is on when the table opens
 * @return string Start column recorded for the new level
 */
protected function _setTableStartColumn($column)
{
    // An outermost table always starts in the first column
    if ($this->_tableLevel == 0) {
        $column = 'A';
    }

    // Push a new level; _releaseTableStartColumn() is the matching pop
    ++$this->_tableLevel;
    $this->_nestedColumn[$this->_tableLevel] = $column;

    return $this->_nestedColumn[$this->_tableLevel];
}
183 
/**
 * Return the start column stored for the current table nesting level.
 *
 * @return string Entry of $this->_nestedColumn at index $this->_tableLevel
 */
protected function _getTableStartColumn()
{
    $level = $this->_tableLevel;

    return $this->_nestedColumn[$level];
}
188 
/**
 * Leave the current table nesting level.
 *
 * Removes the last entry of $this->_nestedColumn and lowers
 * $this->_tableLevel with it, so that the level index keeps pointing at an
 * existing entry for _getTableStartColumn().
 *
 * @return string|null The start column that was removed from the stack
 */
protected function _releaseTableStartColumn()
{
    // Keep the level counter in step with the stack that is popped below
    --$this->_tableLevel;

    return array_pop($this->_nestedColumn);
}
195 
/**
 * Write the pending cell content to the sheet and reset the buffer.
 *
 * String content is written to $sheet (and mirrored into $this->_dataArray)
 * only when it is non-empty after trimming. Any other content is recorded in
 * $this->_dataArray with a 'RICH TEXT: ' prefix; writing it to the sheet is
 * still a TODO. In every case $cellContent is an empty string afterwards.
 *
 * @param mixed  $sheet       Object receiving setCellValue()
 * @param string $column      Column letter(s) of the target cell
 * @param int    $row         Row number of the target cell
 * @param mixed  $cellContent Pending content, passed by reference and cleared
 */
protected function _flushCell($sheet, $column, $row, &$cellContent)
{
    if (!is_string($cellContent)) {
        // We have a Rich Text run
        // TODO
        $this->_dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
    } elseif (trim($cellContent) > '') {
        // Simple string content: only write it when there is something to write
        $coordinate = $column . $row;
        $sheet->setCellValue($coordinate, $cellContent, true);
        $this->_dataArray[$row][$column] = $cellContent;
    }

    $cellContent = '';
}
215 
/**
 * Recursively walk the child nodes of $element and transfer them to $sheet.
 *
 * Text nodes have their whitespace collapsed and are appended to $cellContent
 * (only while $cellContent is a string). Element nodes have their attributes
 * collected into an array and are then dispatched on their node name.
 *
 * $row, $column and $cellContent are passed by reference, so the write cursor
 * and the pending cell text are shared across all recursion levels.
 *
 * @param DOMNode $element     Node whose children are processed
 * @param mixed   $sheet       Object receiving cell values, styles, merges and the title
 * @param int     $row         Current row, updated in place
 * @param string  $column      Current column letter(s), updated in place
 * @param mixed   $cellContent Pending cell text, updated in place
 * @param mixed   $format      Accepted but never read in this body
 */
216  protected function _processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent, $format = null)
217  {
218  foreach ($element->childNodes as $child) {
219  if ($child instanceof DOMText) {
// Trim the text node and collapse every whitespace run to a single space
220  $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
221  if (is_string($cellContent)) {
222  // simply append the text if the cell content is a plain text string
223  $cellContent .= $domText;
224  } else {
225  // but if we have a rich text run instead, we need to append it correctly
226  // TODO
227  }
228  } elseif ($child instanceof DOMElement) {
229 // echo '<b>DOM ELEMENT: </b>' , strtoupper($child->nodeName) , '<br />';
230 
// Flatten the element's attributes into a name => value array
231  $attributeArray = array();
232  foreach ($child->attributes as $attribute) {
233 // echo '<b>ATTRIBUTE: </b>' , $attribute->name , ' => ' , $attribute->value , '<br />';
234  $attributeArray[$attribute->name] = $attribute->value;
235  }
236 
237  switch ($child->nodeName) {
238  case 'meta' :
239  foreach ($attributeArray as $attributeName => $attributeValue) {
240  switch ($attributeName) {
241  case 'content':
242  // TODO
243  // Extract character set, so we can convert to UTF-8 if required
244  break;
245  }
246  }
247  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
248  break;
249  case 'title' :
// The text gathered from the <title> subtree becomes the sheet title
250  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
251  $sheet->setTitle($cellContent);
252  $cellContent = '';
253  break;
254  case 'span' :
255  case 'div' :
256  case 'font' :
257  case 'i' :
258  case 'em' :
259  case 'strong':
260  case 'b' :
261 // echo 'STYLING, SPAN OR DIV<br />';
// Inline elements: pad with a space on either side when there is pending text
262  if ($cellContent > '')
263  $cellContent .= ' ';
264  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
265  if ($cellContent > '')
266  $cellContent .= ' ';
267 // echo 'END OF STYLING, SPAN OR DIV<br />';
268  break;
269  case 'hr' :
270  $this->_flushCell($sheet, $column, $row, $cellContent);
271  ++$row;
272  if (isset($this->_formats[$child->nodeName])) {
273  $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]);
274  } else {
275  $cellContent = '----------';
276  $this->_flushCell($sheet, $column, $row, $cellContent);
277  }
278  ++$row;
// NOTE(review): there is no break here, so 'hr' continues into the 'br' case below - confirm this is intended
279  case 'br' :
280  if ($this->_tableLevel > 0) {
281  // If we're inside a table, replace with a \n
282  $cellContent .= "\n";
283  } else {
284  // Otherwise flush our existing content and move the row cursor on
285  $this->_flushCell($sheet, $column, $row, $cellContent);
286  ++$row;
287  }
288 // echo 'HARD LINE BREAK: ' , '<br />';
289  break;
290  case 'a' :
291 // echo 'START OF HYPERLINK: ' , '<br />';
292  foreach ($attributeArray as $attributeName => $attributeValue) {
293  switch ($attributeName) {
294  case 'href':
295 // echo 'Link to ' , $attributeValue , '<br />';
// Attach the URL to the current cell and apply the 'a' format when one is defined
296  $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue);
297  if (isset($this->_formats[$child->nodeName])) {
298  $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]);
299  }
300  break;
301  }
302  }
303  $cellContent .= ' ';
304  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
305 // echo 'END OF HYPERLINK:' , '<br />';
306  break;
307  case 'h1' :
308  case 'h2' :
309  case 'h3' :
310  case 'h4' :
311  case 'h5' :
312  case 'h6' :
313  case 'ol' :
314  case 'ul' :
315  case 'p' :
316  if ($this->_tableLevel > 0) {
317  // If we're inside a table, replace with a \n
318  $cellContent .= "\n";
319 // echo 'LIST ENTRY: ' , '<br />';
320  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
321 // echo 'END OF LIST ENTRY:' , '<br />';
322  } else {
// Outside a table: flush pending text, give the block its own row, then return to column A
323  if ($cellContent > '') {
324  $this->_flushCell($sheet, $column, $row, $cellContent);
325  $row++;
326  }
327 // echo 'START OF PARAGRAPH: ' , '<br />';
328  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
329 // echo 'END OF PARAGRAPH:' , '<br />';
330  $this->_flushCell($sheet, $column, $row, $cellContent);
331 
332  if (isset($this->_formats[$child->nodeName])) {
333  $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]);
334  }
335 
336  $row++;
337  $column = 'A';
338  }
339  break;
340  case 'li' :
341  if ($this->_tableLevel > 0) {
342  // If we're inside a table, replace with a \n
343  $cellContent .= "\n";
344 // echo 'LIST ENTRY: ' , '<br />';
345  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
346 // echo 'END OF LIST ENTRY:' , '<br />';
347  } else {
348  if ($cellContent > '') {
349  $this->_flushCell($sheet, $column, $row, $cellContent);
350  }
351  ++$row;
352 // echo 'LIST ENTRY: ' , '<br />';
353  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
354 // echo 'END OF LIST ENTRY:' , '<br />';
355  $this->_flushCell($sheet, $column, $row, $cellContent);
356  $column = 'A';
357  }
358  break;
359  case 'table' :
360  $this->_flushCell($sheet, $column, $row, $cellContent);
// NOTE(review): the embedded listing numbers skip 361 here and 367 after the recursive call,
// so a statement may be missing at each point. Nothing visible in this method calls
// _setTableStartColumn() or _releaseTableStartColumn() - confirm against the upstream file.
362 // echo 'START OF TABLE LEVEL ' , $this->_tableLevel , '<br />';
363  if ($this->_tableLevel > 1)
364  --$row;
365  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
366 // echo 'END OF TABLE LEVEL ' , $this->_tableLevel , '<br />';
368  if ($this->_tableLevel > 1) {
369  ++$column;
370  } else {
371  ++$row;
372  }
373  break;
374  case 'thead' :
375  case 'tbody' :
376  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
377  break;
378  case 'tr' :
// Each table row restarts at the table's start column with an empty buffer
379  $column = $this->_getTableStartColumn();
380  $cellContent = '';
381 // echo 'START OF TABLE ' , $this->_tableLevel , ' ROW<br />';
382  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
383  ++$row;
384 // echo 'END OF TABLE ' , $this->_tableLevel , ' ROW<br />';
385  break;
386  case 'th' :
387  case 'td' :
388 // echo 'START OF TABLE ' , $this->_tableLevel , ' CELL<br />';
389  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
390 // echo 'END OF TABLE ' , $this->_tableLevel , ' CELL<br />';
391 
// Skip cells already registered in $this->rowspan by an earlier rowspan merge
392  while (isset($this->rowspan[$column . $row])) {
393  ++$column;
394  }
395 
396  $this->_flushCell($sheet, $column, $row, $cellContent);
397 
398 // if (isset($attributeArray['style']) && !empty($attributeArray['style'])) {
399 // $styleAry = $this->getPhpExcelStyleArray($attributeArray['style']);
400 //
401 // if (!empty($styleAry)) {
402 // $sheet->getStyle($column . $row)->applyFromArray($styleAry);
403 // }
404 // }
405 
406  if (isset($attributeArray['rowspan']) && isset($attributeArray['colspan'])) {
407  //create merging rowspan and colspan
408  $columnTo = $column;
409  for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) {
410  ++$columnTo;
411  }
412  $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
// Register every cell of the merged range so later rows step over it
413  foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) {
414  $this->rowspan[$value] = true;
415  }
416  $sheet->mergeCells($range);
417  $column = $columnTo;
418  } elseif (isset($attributeArray['rowspan'])) {
419  //create merging rowspan
420  $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
421  foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) {
422  $this->rowspan[$value] = true;
423  }
424  $sheet->mergeCells($range);
425  } elseif (isset($attributeArray['colspan'])) {
426  //create merging colspan
427  $columnTo = $column;
428  for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) {
429  ++$columnTo;
430  }
431  $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
432  $column = $columnTo;
433  }
// Advance to the column after this cell (or after its colspan)
434  ++$column;
435  break;
436  case 'body' :
// Reset the cursor to A1 and the table level to 0 before walking the body
437  $row = 1;
438  $column = 'A';
// NOTE(review): $content is assigned here but never read in this method; $cellContent is not reset - confirm intent
439  $content = '';
440  $this->_tableLevel = 0;
441  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
442  break;
443  default:
// Any other element is transparent: only its children are processed
444  $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
445  }
446  }
447  }
448  }
449 
/**
 * Load an HTML file into an existing PHPExcel instance.
 *
 * The file is first opened and sampled by _isValidFormat(), then parsed into a
 * DOM document whose nodes are written to the sheet at $this->_sheetIndex
 * (sheets are created until that index exists).
 *
 * @param  string   $pFilename   Path of the HTML file to read
 * @param  PHPExcel $objPHPExcel Workbook to load into
 * @return PHPExcel The workbook that was passed in
 * @throws PHPExcel_Reader_Exception When the file does not look like HTML or
 *                                   cannot be loaded as a DOM document
 */
public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel)
{
    // Open the file only to validate it; the handle is closed again whether
    // or not validation succeeds.
    $this->_openFile($pFilename);
    $isHtml = $this->_isValidFormat();
    fclose($this->_fileHandle);
    if (!$isHtml) {
        throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file.");
    }

    // Make sure the target sheet exists, then make it the active one
    while ($objPHPExcel->getSheetCount() <= $this->_sheetIndex) {
        $objPHPExcel->createSheet();
    }
    $objPHPExcel->setActiveSheetIndex($this->_sheetIndex);

    // Reload the (security-scanned) HTML file into a DOM object
    $dom = new DOMDocument();
    $loaded = $dom->loadHTML($this->securityScanFile($pFilename));
    if ($loaded === false) {
        // The message parts must be concatenated: passed as separate
        // arguments, the filename would become the exception code and the
        // trailing text the "previous" exception.
        throw new PHPExcel_Reader_Exception('Failed to load ' . $pFilename . ' as a DOM Document');
    }

    // Discard white space (kept where the original sets it, after loading)
    $dom->preserveWhiteSpace = false;

    // Walk the document starting from row 0, column A with an empty buffer
    $row = 0;
    $column = 'A';
    $content = '';
    $this->_processDomElement($dom, $objPHPExcel->getActiveSheet(), $row, $column, $content);

    return $objPHPExcel;
}
494 
/**
 * Return the sheet index stored on the reader.
 *
 * @return int
 */
public function getSheetIndex()
{
    $index = $this->_sheetIndex;

    return $index;
}
504 
/**
 * Store the index of the sheet that loadIntoExisting() activates and fills.
 *
 * @param  int   $pValue Sheet index; defaults to 0
 * @return $this Returned to allow call chaining
 */
public function setSheetIndex($pValue = 0)
{
    $index = $pValue;
    $this->_sheetIndex = $index;

    return $this;
}
517 
/**
 * Reject input that contains an <!ENTITY declaration.
 *
 * The pattern allows an optional NUL byte before, between and after the
 * characters of '<!ENTITY', so the marker is also found when its characters
 * are interleaved with NUL bytes.
 *
 * @param  string $xml Content to scan
 * @return string The unchanged input when no ENTITY declaration is found
 * @throws PHPExcel_Reader_Exception When an ENTITY declaration is detected
 */
public function securityScan($xml)
{
    $separator = '\\0?';
    $letters = str_split('<!ENTITY');
    $pattern = '/\\0?' . implode($separator, $letters) . '\\0?/';

    $found = preg_match($pattern, $xml);
    if ($found) {
        throw new PHPExcel_Reader_Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
    }

    return $xml;
}
532 
533 }
534 
_releaseTableStartColumn()
Definition: HTML.php:189
_flushCell($sheet, $column, $row, &$cellContent)
Definition: HTML.php:196
setSheetIndex($pValue=0)
Set sheet index.
Definition: HTML.php:511
getSheetCount()
Get sheet count.
Definition: PHPExcel.php:661
$objPHPExcel
_isValidFormat()
Validate that the current file is an HTML file.
Definition: HTML.php:119
Add rich text string
The name of the decorator.
createSheet($iSheetIndex=NULL)
Create sheet and add it to this workbook.
Definition: PHPExcel.php:479
securityScanFile($filestream)
Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks.
Definition: Abstract.php:251
$column
Definition: 39dropdown.php:62
load($pFilename)
Loads PHPExcel from file.
Definition: HTML.php:138
__construct()
Create a new PHPExcel_Reader_HTML.
Definition: HTML.php:109
getInputEncoding()
Get input encoding.
Definition: HTML.php:164
Create styles array
The data for the language used.
securityScan($xml)
Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks.
Definition: HTML.php:524
getActiveSheet()
Get active sheet.
Definition: PHPExcel.php:467
static extractAllCellReferencesInRange($pRange='A1')
Extract all cell references in range.
Definition: Cell.php:854
setActiveSheetIndex($pIndex=0)
Set active sheet index.
Definition: PHPExcel.php:683
setInputEncoding($pValue='ANSI')
Set input encoding.
Definition: HTML.php:152
getSheetIndex()
Get sheet index.
Definition: HTML.php:500
_processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent, $format=null)
Definition: HTML.php:216
_openFile($pFilename)
Open file for reading.
Definition: Abstract.php:195
defined( 'APPLICATION_ENV')||define( 'APPLICATION_ENV'
Definition: bootstrap.php:27
_setTableStartColumn($column)
Definition: HTML.php:174
loadIntoExisting($pFilename, PHPExcel $objPHPExcel)
Loads PHPExcel from file into PHPExcel instance.
Definition: HTML.php:458