Parsing an HTML Table with PEAR’s XML_HTMLSax3

Here’s an example of how to parse an HTML table into an array using the PEAR module XML_HTMLSax3. It supports the <tr>, <td> and <th> elements and the rowspan and colspan attributes.

It’s worth noting that this code will raise a bunch of notices if you run it displaying all errors. This is pretty difficult to avoid, so if you don’t like that, disable the display of notices.

<?php
/**
* Parses an HTML table into a two-dimensional array using PEAR's XML_HTMLSax3.
*
* @author Toby Inkster
* @copyright Copyright (C) 2007 Toby Inkster
* @license http://www.gnu.org/licenses/gpl-3.0.html GNU General Public Licence
*/

/**
 * Parser class.
 *
 * Acts as the handler object for an XML_HTMLSax3 parser and turns the
 * stream of open-tag / text events into a two-dimensional array indexed as
 * [row][column]. A cell that carries rowspan/colspan reserves every grid
 * position it covers, so later cells are placed in the first free column.
 * Only the text of a spanning cell's top-left position is stored.
 *
 * Limitation: every <tr> advances the row counter, so a table nested inside
 * a cell is merged into the outer grid rather than parsed separately.
 *
 * You probably only need to directly access the "Go" method.
 */
class TableParser
{
    /** Zero-based row of the current cell; -1 until the first <tr> is seen. */
    private $currow = -1;

    /** Zero-based column of the current cell; -1 until the row's first cell. */
    private $curcol = -1;

    /** Occupancy grid: $shape[row][col] is true once a cell covers that slot. */
    private $shape = array();

    /** Collected text: $data[row][col] is the concatenated text of that cell. */
    private $data = array();

    /**
     * Open-tag callback: moves the cell cursor and records spans.
     *
     * @param object $parser The SAX parser instance (unused).
     * @param string $tag    Tag name, in any letter case.
     * @param array  $attrs  Attribute name => value pairs of the tag.
     */
    public function openHandler($parser, $tag, $attrs)
    {
        $tag = strtolower($tag);

        if ($tag === 'tr') {
            // New row: the next cell starts searching from column 0.
            $this->currow++;
            $this->curcol = -1;
            return;
        }

        if ($tag !== 'td' && $tag !== 'th') {
            // Only cell tags may move the cursor. Markup nested inside a
            // cell (<b>, <a>, ...) would otherwise find its own cell
            // "occupied" and push the following text into the next column.
            return;
        }

        $this->curcol++;

        // Skip positions already claimed by a rowspan/colspan of an earlier cell.
        while (!empty($this->shape[$this->currow][$this->curcol])) {
            $this->curcol++;
        }

        $rowspan = 1;
        $colspan = 1;
        foreach ((array) $attrs as $name => $value) {
            $name = strtolower($name);
            if ($name === 'rowspan') {
                $rowspan = (int) $value;
            } elseif ($name === 'colspan') {
                $colspan = (int) $value;
            }
        }

        // A span of 0 or a non-numeric value must still reserve the cell itself.
        $rowspan = max(1, $rowspan);
        $colspan = max(1, $colspan);

        for ($i = 0; $i < $rowspan; $i++) {
            for ($j = 0; $j < $colspan; $j++) {
                $row = $this->currow + $i;
                $col = $this->curcol + $j;
                if (!empty($this->shape[$row][$col])) {
                    // Two cells claim the same slot: the markup is inconsistent.
                    error_log('Overlap!');
                }
                $this->shape[$row][$col] = true;
            }
        }
    }

    /**
     * Close-tag callback. Intentionally empty; it exists because the element
     * handlers are registered as an open/close pair in Go().
     *
     * @param object $parser The SAX parser instance (unused).
     * @param string $tag    Tag name.
     */
    public function closeHandler($parser, $tag)
    {
    }

    /**
     * Text callback: appends character data to the current cell.
     *
     * Text seen before any cell has been opened lands at row or column -1
     * and is discarded by getData().
     *
     * @param object $parser The SAX parser instance (unused).
     * @param string $data   Chunk of character data.
     */
    public function dataHandler($parser, $data)
    {
        // Create the slot first so that ".=" never touches an undefined offset.
        if (!isset($this->data[$this->currow][$this->curcol])) {
            $this->data[$this->currow][$this->curcol] = '';
        }
        $this->data[$this->currow][$this->curcol] .= $data;
    }

    /**
     * Returns the collected cell text.
     *
     * Drops the placeholder row -1 and column -1, which only hold text found
     * outside of any cell (typically whitespace between tags).
     *
     * @return array Cell text indexed as [row][column].
     */
    public function getData()
    {
        unset($this->data[-1]);
        foreach (array_keys($this->data) as $row) {
            unset($this->data[$row][-1]);
        }
        return $this->data;
    }

    /**
     * Parses a chunk of HTML containing a table and returns its cells.
     *
     * Requires the PEAR package XML_HTMLSax3 to be on the include path.
     *
     * @param string $table_html HTML markup of the table.
     * @return array Cell text indexed as [row][column].
     */
    public static function Go($table_html)
    {
        require_once 'XML/HTMLSax3.php';

        $sax  = new XML_HTMLSax3();
        $hdlr = new self();

        $sax->set_object($hdlr);
        $sax->set_element_handler('openHandler', 'closeHandler');
        $sax->set_data_handler('dataHandler');
        $sax->parse($table_html);

        return $hdlr->getData();
    }
}

// Demonstration: parse a small sample table and dump the resulting array.
// NOTE(review): the original sample markup was lost; the table below was
// rebuilt from the surviving cell text so that it exercises both rowspan
// and colspan. Confirm against the original article if exact output matters.
$table = '
<table>
<caption>Test table</caption>
<tr><th rowspan="2">lalala</th><td>123</td><td>456</td></tr>
<tr><td>789</td><td>ABC</td></tr>
<tr><td colspan="2">123</td><td>456</td></tr>
<tr><td colspan="3">789</td></tr>
</table>
';
print_r(TableParser::Go($table));

?>