Parsing an HTML Table with PEAR’s XML_HTMLSax3

Here’s an example of how to parse an HTML table into an array using the PEAR module XML_HTMLSax3. It supports the <tr>, <td> and <th> elements and the rowspan and colspan attributes. The version of this code originally published here raised a bunch of notices when run with all errors displayed, because it read array elements before they had been set. The listing below checks with isset() first, so it runs cleanly under full error reporting.

<?php
/**
 * Parse an HTML table into a two-dimensional array with PEAR's XML_HTMLSax3.
 *
 * @author Toby Inkster
 * @copyright Copyright (C) 2007 Toby Inkster
 * @license http://www.gnu.org/licenses/gpl-3.0.html GNU General Public Licence
 */

/**
 * Parser class.
 *
 * You probably only need to directly access the "Go" method. The three
 * handler methods are public only because XML_HTMLSax3 calls them back by
 * name while it walks the markup.
 *
 * Nested tables are not supported: every <tr>, <td> and <th> is treated as
 * belonging to one single grid.
 */
class TableParser
{
    /** Zero-based index of the row being read; -1 until the first <tr>. */
    private $currow = -1;

    /** Zero-based index of the cell being read; -1 at the start of a row. */
    private $curcol = -1;

    /** TRUE between the start of a <td>/<th> and the end of that cell. */
    private $incell = false;

    /** Grid slots already claimed by a cell: $shape[row][col] = TRUE. */
    private $shape = array();

    /** Collected cell text: $data[row][col] = string. */
    private $data = array();

    /**
     * SAX callback for an opening tag.
     *
     * Only <tr>, <td> and <th> affect the grid. Every other element (the
     * <table> itself, <caption>, inline markup inside a cell, ...) is ignored
     * so that it can neither claim a grid slot nor shift the current column.
     *
     * @param object $parser The SAX parser (unused).
     * @param string $tag    Element name, in any letter case.
     * @param array  $attrs  Attribute name => value pairs.
     * @return void
     */
    public function openHandler($parser, $tag, $attrs)
    {
        $tag = strtolower($tag);

        if ($tag == 'tr') {
            $this->currow++;
            $this->curcol = -1;
            $this->incell = false;
            return;
        }

        if ($tag != 'td' && $tag != 'th') {
            return;
        }

        // Move to the next free slot, stepping over slots that were claimed
        // by the rowspan/colspan of an earlier cell.
        $this->curcol++;
        while (isset($this->shape[$this->currow][$this->curcol])) {
            $this->curcol++;
        }

        $rowspan = 1;
        $colspan = 1;
        foreach ((array) $attrs as $name => $value) {
            $name = strtolower($name);
            if ($name == 'rowspan') {
                // A zero, negative or non-numeric span still occupies one slot.
                $rowspan = max(1, (int) $value);
            } elseif ($name == 'colspan') {
                $colspan = max(1, (int) $value);
            }
        }

        // Claim every slot this cell covers.
        for ($i = 0; $i < $rowspan; $i++) {
            for ($j = 0; $j < $colspan; $j++) {
                $x = $this->currow + $i;
                $y = $this->curcol + $j;
                if (isset($this->shape[$x][$y])) {
                    error_log('Overlap!');
                }
                $this->shape[$x][$y] = TRUE;
            }
        }

        // Start the cell empty so that dataHandler can append safely and so
        // that a cell with no text still shows up in the result.
        $this->data[$this->currow][$this->curcol] = '';
        $this->incell = true;
    }

    /**
     * SAX callback for a closing tag.
     *
     * Closing a cell, a row or the table ends the current cell, so text that
     * follows (typically whitespace between tags) is not appended to it.
     *
     * @param object $parser The SAX parser (unused).
     * @param string $tag    Element name, in any letter case.
     * @return void
     */
    public function closeHandler($parser, $tag)
    {
        if (in_array(strtolower($tag), array('td', 'th', 'tr', 'table'), true)) {
            $this->incell = false;
        }
    }

    /**
     * SAX callback for character data: appends it to the current cell.
     *
     * Text found outside any cell is discarded.
     *
     * @param object $parser The SAX parser (unused).
     * @param string $data   A run of text from the document.
     * @return void
     */
    public function dataHandler($parser, $data)
    {
        if ($this->incell) {
            $this->data[$this->currow][$this->curcol] .= $data;
        }
    }

    /**
     * Return the collected cell text.
     *
     * @return array Rows indexed from 0, each an array of cell strings keyed
     *               by column index. A slot covered by another cell's rowspan
     *               or colspan has no entry of its own.
     */
    public function getData()
    {
        $rows = $this->data;

        // Cells that appeared before any <tr> were filed under row -1; they
        // are not part of the result.
        unset($rows[-1]);

        return $rows;
    }

    /**
     * Parse a string of HTML containing a table and return its cells.
     *
     * @param string $table_html The HTML to parse.
     * @return array The structure described for getData().
     */
    public static function Go($table_html)
    {
        require_once 'XML/HTMLSax3.php';

        $sax = new XML_HTMLSax3();
        $hdlr = new self();

        // XML_HTMLSax3 calls the handlers by method name on the given object.
        $sax->set_object($hdlr);
        $sax->set_element_handler('openHandler', 'closeHandler');
        $sax->set_data_handler('dataHandler');
        $sax->parse($table_html);

        return $hdlr->getData();
    }
}

// NOTE(review): the markup of the sample table did not survive extraction of
// this listing; only the cell texts did. The table below is a reconstruction
// around those texts that exercises both rowspan and colspan. Confirm against
// the original post if the exact sample matters.
$table = '
<table>
<caption>Test table</caption>
<tr><th rowspan="2">lalala</th><td>123</td><td>456</td></tr>
<tr><td>789</td><td>ABC</td></tr>
<tr><td>123</td><td colspan="2">456</td></tr>
<tr><td colspan="3">789</td></tr>
</table>
';

print_r(TableParser::Go($table));