Parsing an HTML Table with PEAR’s XML_HTMLSax3

This is a very old article. It has been imported from older blogging software, and the formatting, images, etc may have been lost. Some links may be broken. Some of the information may no longer be correct. Opinions expressed in this article may no longer be held.

Here’s an example of how to parse an HTML table into an array using the PEAR module XML_HTMLSax3. It supports the `<tr>`, `<td>` and `<th>` elements and the rowspan and colspan attributes.

It’s worth noting that this code will raise a bunch of notices if you run it displaying all errors. This is pretty difficult to avoid, so if you don’t like that, disable the display of notices.

<?php
/**
* Parse an HTML table into an array using PEAR's XML_HTMLSax3.
*
* @author Toby Inkster
* @copyright Copyright (C) 2007 Toby Inkster
* @license http://www.gnu.org/licenses/gpl-3.0.html GNU General Public Licence
*/

/**
* Parser class.
*
* Acts as the handler object for a SAX-style HTML parser: the parser calls
* openHandler(), closeHandler() and dataHandler() as it walks the markup, and
* the text found inside each cell is collected into a two-dimensional array
* indexed as [row][column]. Cells covered by a rowspan or colspan are skipped,
* so the column index of every cell reflects its real position in the grid.
*
* Cell text is stored exactly as received (no trimming), and any text that
* arrives between a closing cell tag and the next opening cell tag is appended
* to the preceding cell.
*
* You probably only need to directly access the "Go" method.
*/
class TableParser
{
    // Zero-based index of the row being read; -1 until the first <tr>.
    private $currow = -1;
    // Zero-based index of the cell being read; -1 until the first cell of a row.
    private $curcol = -1;

    // Occupancy grid: $shape[row][col] is TRUE once a cell (or a span) covers it.
    private $shape = array();
    // Collected text: $data[row][col] is the concatenated character data.
    private $data = array();

    /**
    * Opening-tag callback.
    *
    * A <tr> starts a new row; a <td> or <th> moves to the next free column
    * and reserves every grid position covered by its rowspan/colspan. All
    * other tags are ignored, so markup nested inside a cell (e.g. <b>) does
    * not shift the current column.
    *
    * @param mixed  $parser The parser instance (unused).
    * @param string $tag    Tag name, in any letter case.
    * @param array  $attrs  Attribute name => value pairs, names in any case.
    */
    public function openHandler ($parser, $tag, $attrs)
    {
        $tag = strtolower($tag);

        if ($tag === 'tr')
        {
            $this->currow++;
            $this->curcol = -1;
            return;
        }

        // Only cell tags occupy positions in the grid.
        if ($tag !== 'td' && $tag !== 'th')
            return;

        $this->curcol++;

        // Skip positions already claimed by a rowspan/colspan of an earlier cell.
        while (!empty($this->shape[$this->currow][$this->curcol]))
            $this->curcol++;

        $rowspan = 1;
        $colspan = 1;
        if (is_array($attrs))
        {
            foreach ($attrs as $k => $v)
            {
                $k = strtolower($k);
                // Clamp to 1 so that a zero, negative or non-numeric value
                // still reserves the cell's own position.
                if ($k === 'rowspan')
                    $rowspan = max(1, (int)$v);
                elseif ($k === 'colspan')
                    $colspan = max(1, (int)$v);
            }
        }

        // Reserve the rectangle this cell covers.
        for ($i = 0; $i < $rowspan; $i++)
        {
            for ($j = 0; $j < $colspan; $j++)
            {
                $x = $this->currow + $i;
                $y = $this->curcol + $j;
                if (!empty($this->shape[$x][$y]))
                    error_log('Overlap!');
                $this->shape[$x][$y] = TRUE;
            }
        }
    }

    /**
    * Closing-tag callback. Intentionally does nothing; it exists because the
    * element handler registration in Go() names both an open and a close
    * method.
    *
    * @param mixed  $parser The parser instance (unused).
    * @param string $tag    Tag name.
    */
    public function closeHandler ($parser, $tag)
    {
    }

    /**
    * Character-data callback: appends the text to the current cell.
    *
    * Text seen before the first row or before the first cell of a row is
    * filed under index -1 and discarded later by getData().
    *
    * @param mixed  $parser The parser instance (unused).
    * @param string $data   A chunk of character data.
    */
    public function dataHandler ($parser, $data)
    {
        // Create the slot first so appending never touches an undefined index.
        if (!isset($this->data[$this->currow][$this->curcol]))
            $this->data[$this->currow][$this->curcol] = '';

        $this->data[$this->currow][$this->curcol] .= $data;
    }

    /**
    * Returns the collected cell text.
    *
    * Removes the placeholder row -1 and the placeholder column -1 of every
    * row (text that did not belong to any cell) before returning.
    *
    * @return array Cell text indexed as [row][column]; positions covered only
    *               by a span of another cell have no entry.
    */
    public function getData ()
    {
        unset($this->data[-1]);
        foreach (array_keys($this->data) as $k)
            unset($this->data[$k][-1]);
        return $this->data;
    }

    /**
    * Parses a chunk of HTML containing a table and returns its cells.
    *
    * Loads PEAR's XML_HTMLSax3, registers a fresh TableParser as the handler
    * object and runs the parser over the given markup.
    *
    * @param string $table_html HTML source of the table.
    * @return array Cell text indexed as [row][column] (see getData()).
    */
    public static function Go ($table_html)
    {
        require_once 'XML/HTMLSax3.php';
        $sax = new XML_HTMLSax3();
        $hdlr = new TableParser();
        $sax->set_object($hdlr);
        $sax->set_element_handler('openHandler', 'closeHandler');
        $sax->set_data_handler('dataHandler');
        $sax->parse($table_html);
        return $hdlr->getData();
    }

}

// Demonstration: parse a small table that exercises rowspan and colspan and
// print the resulting array.
//
// NOTE(review): the table markup was stripped when this article was imported;
// only the cell text survived. The tags and span attributes below are a
// reconstruction around that text -- confirm against the original if available.
$table = '
<table>
<tr><th rowspan="2">Test table</th><td>lalala</td><td>123</td><td>456</td></tr>
<tr><td colspan="2">789</td><td>ABC</td></tr>
<tr><td colspan="2">123</td><td colspan="2">456</td></tr>
<tr><td colspan="4">789</td></tr>
</table>
';
print_r(TableParser::Go($table));

?>