nodeType===XML_TEXT_NODE) { $nextNode=$node->nextSibling; if (!$nextNode) { break; } $node=$nextNode; } }

    /**
     * Walks $node backwards over text nodes until a non-text sibling is reached.
     *
     * The argument is updated in place. When only text nodes precede it, $node
     * is left on the first sibling (still a text node); a null $node stays null.
     * Callers must therefore check the node type before using element-only API.
     *
     * @param DOMNode|null $node Starting node, replaced by the node found.
     */
    private function jumpToPreviousTag(&$node)
    {
        while ($node && $node->nodeType === XML_TEXT_NODE) {
            $previousNode = $node->previousSibling;
            if (!$previousNode) {
                break;
            }
            $node = $previousNode;
        }
    }

    /**
     * Downloads the page at getURI(), parses it with DOMDocument and appends one
     * entry to $this->items for every <h2 class="SummaryHL"> heading found.
     *
     * Each entry is an array with the keys 'author', 'uri', 'timestamp',
     * 'title' and 'content'.
     */
    public function collectData()
    {
        // Because the LWN page is written in loose HTML and not XHTML,
        // Simple HTML Dom is not accurate enough for the job
        $content = getContents($this->getURI())
            or returnServerError('No results for LWNprev');

        // Loose HTML makes libxml emit many warnings; collect and discard them.
        libxml_use_internal_errors(true);
        $html = new DOMDocument();
        $html->loadHTML($content);
        libxml_clear_errors();

        // Categories persist across headings until a new category header is met.
        $cat1 = '';
        $cat2 = '';

        // Base URI for headings that carry no link of their own. Fall back to
        // the page being parsed when the "Multi-page format" link is missing,
        // instead of silently using whichever anchor happened to be last.
        $realURI = $this->getURI();
        foreach ($html->getElementsByTagName('a') as $anchor) {
            if ($anchor->textContent === 'Multi-page format') {
                $realURI = self::URI . $anchor->getAttribute('href');
                break;
            }
        }

        $URICounter = 0;

        // The edition date is whatever follows "for " in the first <h1>.
        // Stays 0 when the heading is missing or the date cannot be parsed.
        $editionTimeStamp = 0;
        $h1 = $html->getElementsByTagName('h1')->item(0);
        if ($h1) {
            $edition = $h1->textContent;
            $forPos = strpos($edition, 'for ');
            $dateText = $forPos === false
                ? $edition
                : substr($edition, $forPos + strlen('for '));
            $parsed = strtotime($dateText);
            if ($parsed !== false) {
                $editionTimeStamp = $parsed;
            }
        }

        foreach ($html->getElementsByTagName('h2') as $h2) {
            if ($h2->getAttribute('class') !== 'SummaryHL') {
                continue;
            }

            $item = array();

            // Author: taken from the byline element right after the heading.
            // The sibling may be null, a text node or a comment, none of which
            // have attributes, hence the DOMElement check.
            $h2NextSibling = $h2->nextSibling;
            $this->jumpToNextTag($h2NextSibling);
            $bylineClass = $h2NextSibling instanceof DOMElement
                ? $h2NextSibling->getAttribute('class')
                : '';
            switch ($bylineClass) {
                case 'FeatureByline':
                    $bold = $h2NextSibling->getElementsByTagName('b')->item(0);
                    $item['author'] = $bold ? $bold->textContent : 'LWN';
                    break;
                case 'GAByline':
                    // Keep only what follows "by ", mirroring the "for "
                    // handling of the edition date above.
                    $text = $h2NextSibling->textContent;
                    $byPos = strpos($text, 'by ');
                    $item['author'] = trim(
                        $byPos === false
                            ? $text
                            : substr($text, $byPos + strlen('by '))
                    );
                    break;
                default:
                    $item['author'] = 'LWN';
                    break;
            }

            // URI: the heading's own link when it has one, otherwise a
            // numbered fragment on the multi-page URI so entries stay unique.
            $h2FirstChild = $h2->firstChild;
            $this->jumpToNextTag($h2FirstChild);
            if ($h2FirstChild instanceof DOMElement && $h2FirstChild->nodeName === 'a') {
                $item['uri'] = self::URI . $h2FirstChild->getAttribute('href');
            } else {
                $item['uri'] = $realURI . '#' . $URICounter;
            }

            // Offset each entry by one second so entries keep the page order.
            $URICounter++;
            $item['timestamp'] = $editionTimeStamp + $URICounter;

            // Categories: a Cat2HL header directly before the heading may
            // itself be preceded by a Cat1HL header; a lone Cat1HL header
            // resets the sub-category.
            $h2PrevSibling = $h2->previousSibling;
            $this->jumpToPreviousTag($h2PrevSibling);
            $prevClass = $h2PrevSibling instanceof DOMElement
                ? $h2PrevSibling->getAttribute('class')
                : '';
            switch ($prevClass) {
                case 'Cat2HL':
                    $cat2 = $h2PrevSibling->textContent;
                    $h2PrevSibling = $h2PrevSibling->previousSibling;
                    $this->jumpToPreviousTag($h2PrevSibling);
                    if (
                        $h2PrevSibling instanceof DOMElement
                        && $h2PrevSibling->getAttribute('class') === 'Cat1HL'
                    ) {
                        $cat1 = $h2PrevSibling->textContent;
                    }
                    break;
                case 'Cat1HL':
                    $cat1 = $h2PrevSibling->textContent;
                    $cat2 = '';
                    break;
                default:
                    break;
            }

            $item['title'] = '';
            if (!empty($cat1)) {
                $item['title'] .= '[' . $cat1 . ($cat2 ? '/' . $cat2 : '') . '] ';
            }
            $item['title'] .= $h2->textContent;

            // Content: every following sibling (text included) up to the next
            // <h2> or category header, serialised with C14N.
            $body = '';
            for ($node = $h2->nextSibling; $node; $node = $node->nextSibling) {
                if (
                    $node instanceof DOMElement
                    && (
                        $node->nodeName === 'h2'
                        || in_array($node->getAttribute('class'), array('Cat1HL', 'Cat2HL'), true)
                    )
                ) {
                    break;
                }
                $body .= $node->C14N();
            }
            $item['content'] = $body;

            $this->items[] = $item;
        }
    }
}