11class WebScraper {
/** @var DOMNodeList|false|null Result of the most recent XPath query; set back to null by the mutating helpers. */
public $obj;

/** @var bool|null True after an HTML loader ran, false after loadXML(), null before anything was loaded. */
public $ishtml = null;

/** @var string|null The raw (unconverted) selector most recently passed to Q() or query(). */
public $query;

/** @var DOMDocument The document being inspected and modified. */
public $dom;

/** @var DOMXPath XPath evaluator bound to $dom; rebuilt whenever the document is (re)loaded. */
public $xpath;
33
/**
 * Create a scraper that holds an empty DOM document.
 *
 * Nothing is parsed here; content is supplied afterwards through one of
 * the loader methods.
 */
public function __construct()
{
    $document = new DOMDocument();
    $this->dom = $document;
}
87
/**
 * Load an HTML document from a file path or URL and prepare it for querying.
 *
 * libxml parse errors are buffered internally while loading instead of
 * being emitted as PHP warnings.
 *
 * @param string $url Location handed to DOMDocument::loadHTMLFile().
 */
public function loadHTMLFile($url)
{
    // libxml_use_internal_errors() returns the previous setting; remember it
    // so the caller's configuration is restored rather than forced to false.
    $previous = libxml_use_internal_errors(true);

    try {
        $this->dom->loadHTMLFile($url, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
    } finally {
        if (!$previous) {
            // We enabled buffering ourselves, so drop the errors we collected;
            // otherwise they stay in libxml's buffer indefinitely.
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);
    }

    $this->xpath = new DOMXPath($this->dom);
    $this->ishtml = true;
}
15+
/**
 * Parse an XML string into the document and prepare it for querying.
 *
 * Marks the scraper as holding XML (not HTML) content.
 *
 * @param string $XML XML source handed to DOMDocument::loadXML().
 */
public function loadXML($XML)
{
    $document = $this->dom;
    $document->loadXML($XML);

    $this->xpath = new DOMXPath($document);
    $this->ishtml = false;
}
1821
/**
 * Parse an HTML string into the document and prepare it for querying.
 *
 * As in loadHTMLFile(), libxml parse errors are buffered internally while
 * loading so that imperfect markup does not raise PHP warnings.
 *
 * @param string $HTML HTML source handed to DOMDocument::loadHTML().
 */
public function loadHTML($HTML)
{
    // Remember the previous setting so it can be restored afterwards.
    $previous = libxml_use_internal_errors(true);

    try {
        $this->dom->loadHTML($HTML);
    } finally {
        if (!$previous) {
            // Buffering was switched on only for this call; discard what it collected.
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);
    }

    $this->xpath = new DOMXPath($this->dom);
    $this->ishtml = true;
}
27+
1928 private function convert2XPath($query){
2029 $xpath = $query;
2130
@@ -94,7 +103,16 @@ private function convert2XPath($query){
94103 return $xpath;
95104 }
96105
/**
 * Run a selector against the whole document and keep the matches.
 *
 * The raw selector is stored in $this->query, converted with
 * convert2XPath(), prefixed with "//" and evaluated; the result is
 * stored in $this->obj.
 *
 * @param string $query Selector in the syntax understood by convert2XPath().
 * @return $this
 */
public function Q($query)
{
    $this->query = $query;

    $converted = $this->convert2XPath($query);
    $this->obj = $this->xpath->query("//$converted");

    return $this;
}
114+
115+ private function _($query){
98116 $query = $this->convert2XPath($query);
99117
100118 $this->obj = $this->xpath->query("//$query");
@@ -103,6 +121,7 @@ public function Q($query, $root = false){
103121 }
104122
105123 public function query($query){
124+ $this->query = $query;
106125 $query = $this->convert2XPath($query);
107126
108127 $this->obj = $this->xpath->query("//$query");
@@ -117,6 +136,13 @@ public function setAttribute($attr, $value){
117136 $this->obj = null;
118137 }
119138
/**
 * Remove an attribute from every node in the current selection.
 *
 * The selection ($this->obj) is cleared afterwards.
 *
 * @param string $attr Name of the attribute to remove.
 */
public function removeAttribute($attr)
{
    foreach ($this->obj as $node) {
        $node->removeAttribute((string) $attr);
    }

    $this->obj = null;
}
145+
120146 public function addClass($class){
121147 foreach ($this->obj as $item){
122148 $otherClasses = $item->getAttribute("class");
@@ -182,20 +208,32 @@ public function appendHtml($html){
182208 $this->obj = null;
183209 }
184210
/**
 * Insert a piece of markup as the first child of every selected node.
 *
 * The markup is parsed with DOMDocument::loadXML(), so it must be
 * well-formed with a single root element. The selection ($this->obj) is
 * cleared afterwards.
 *
 * @param string $html Well-formed markup to prepend.
 */
public function prependHtml($html)
{
    $source = new DOMDocument();
    $source->loadXML($html);

    // Import the root element only. A deep import already carries all of its
    // descendants; importing every element matched by "//*" would insert each
    // nested element a second time, ahead of its own parent.
    $root = $source->documentElement;

    if ($root !== null) {
        foreach ($this->obj as $item) {
            // A fresh deep copy per target, owned by the scraper's document.
            $copy = $this->dom->importNode($root, true);
            $item->insertBefore($copy, $item->firstChild);
        }
    }

    $this->obj = null;
}
226+
185227 public function delete($keepinner = false){
186228
187229 foreach($this->obj as $item){
188- if (!$item->parentNode->hasAttribute($item->nodeName)){
189- if (!$keepinner){
190- $item->parentNode->removeChild($item);
191- } else {
192- while ($item->firstChild instanceof DOMNode) {
193- $item->parentNode->insertBefore($item->firstChild, $item);
194- }
195- $item->parentNode->removeChild($item);
196- }
230+ if (!$keepinner){
231+ $item->parentNode->removeChild($item);
197232 } else {
198- $item->parentNode->removeAttribute($item->nodeName);
233+ while ($item->firstChild instanceof DOMNode) {
234+ $item->parentNode->insertBefore($item->firstChild, $item);
235+ }
236+ $item->parentNode->removeChild($item);
199237 }
200238 }
201239
@@ -212,9 +250,8 @@ public function unwrap(){
212250 $this->obj = null;
213251 }
214252
215- public function wrap($html){
216- $tag = preg_replace("/<([\w\d]+).*><\/\\1>/", "$1", $html);
217-
253+ private function breakUp($tag, &$html, &$keys, &$vals, &$attrs){
254+
218255 $html = preg_replace_callback(
219256 '/([^=<>\s]*)=[\'|"]([^=]*)[\'|"]/',
220257 function($m){
@@ -228,8 +265,6 @@ function($m){
228265 $html
229266 );
230267
231- $keys = array();
232- $vals = array();
233268 $lines = explode("]", $html);
234269
235270 foreach($lines as $index => $value){
@@ -241,6 +276,17 @@ function($m){
241276 array_push($vals, $arr[1]);
242277 }
243278 $attrs = array_combine($keys, $vals);
279+ }
280+
281+ public function wrap($html){
282+ $attrs = array();
283+ $keys = array();
284+ $vals = array();
285+
286+ if (preg_match("/<([\w\d]+).*>[^<>]*<\/\\1>/", $html)){
287+ $tag = preg_replace("/<([\w\d]+).*>[^<>]*<\/\\1>/", "$1", $html);
288+ $this->breakUp($tag, $html, $keys, $vals, $attrs);
289+ }
244290
245291 foreach($this->obj as $item){
246292 $wrapper = $this->dom->createElement("$tag");
@@ -255,7 +301,7 @@ function($m){
255301 $this->obj = null;
256302 }
257303
258- public function delEmptyTags (){
304+ public function removeEmptyTags (){
259305 $query = '//*[not(*) and not(@*) and not(text()[normalize-space()])]';
260306 foreach($this->xpath->query("$query") as $tag){
261307 $tag->parentNode->removeChild($tag);
@@ -315,33 +361,12 @@ public function count(){
315361 }
316362
/**
 * Print the whole document.
 *
 * The document is serialized as HTML when it was loaded through an HTML
 * loader ($this->ishtml is truthy) and as XML otherwise.
 *
 * @param bool $format Value assigned to DOMDocument::$formatOutput before serializing.
 */
public function echo($format = true)
{
    $this->dom->formatOutput = $format;

    // Emit the markup verbatim. It must not be used as a printf() format
    // string: any "%" in the document would be read as a conversion specifier.
    echo $this->ishtml
        ? $this->dom->saveHTML()
        : $this->dom->saveXML();
}
346371
347372 public function replaceText($pattern, $replace, $html = true){
@@ -354,8 +379,17 @@ public function replaceText($pattern, $replace, $html = true){
354379 $item->textContent = $newtext;
355380 }
356381
357- if($html){
358- $this->dom->loadHTML(html_entity_decode($this->dom->saveXML()));
382+ if ($html) {
383+ if ($this->ishtml) {
384+ $this->dom->loadHTML(
385+ html_entity_decode($this->dom->saveHTML())
386+ );
387+ } else {
388+ $this->dom->loadXML(
389+ html_entity_decode($this->dom->saveXML())
390+ );
391+ }
392+ $this->xpath = new DOMXPath($this->dom);
359393 }
360394 }
361395
@@ -371,11 +405,20 @@ function($m) use ($func){
371405 $item->textContent = $newtext;
372406 }
373407
374- if($html){
375- $this->dom->loadHTML(html_entity_decode($this->dom->saveXML()));
408+ if ($html) {
409+ if ($this->ishtml) {
410+ $this->dom->loadHTML(
411+ html_entity_decode($this->dom->saveHTML())
412+ );
413+ } else {
414+ $this->dom->loadXML(
415+ html_entity_decode($this->dom->saveXML())
416+ );
417+ }
418+ $this->xpath = new DOMXPath($this->dom);
376419 }
377420 }
378-
421+
379422 public function hasClass($class){
380423 foreach($this->obj as $item){
381424 $classes = $item->getAttribute("class");
@@ -391,4 +434,12 @@ public function hasAttr($attr, $val){
391434 $bool = (preg_match("/".preg_quote($val)."/", $attrs)) ? true : false;
392435 return $bool;
393436 }
437+
/**
 * Invoke a callback once per node in the current selection.
 *
 * For the i-th node (counting from 1) the stored selector is suffixed
 * with "[i]", passed through $this->_(), and whatever that returns is
 * handed to the callback.
 *
 * NOTE(review): in XPath, "//x[i]" matches x elements that are the i-th x
 * child of their own parent, not the i-th match overall - confirm this is
 * the intended selection.
 *
 * @param callable $func Callback receiving the result of $this->_() for each position.
 */
public function iterate($func)
{
    $position = 0;

    foreach ($this->obj as $unused) {
        $position++;
        $func($this->_($this->query . "[$position]"));
    }
}
394445}
0 commit comments