Skip to content

Commit 70df566

Browse files
Update webscraper.php
1 parent 0f4767d commit 70df566

1 file changed

Lines changed: 109 additions & 58 deletions

File tree

webscraper.php

Lines changed: 109 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -1,21 +1,30 @@
11
class WebScraper {
2-
public $obj, $dom, $xpath;
2+
public $obj, $ishtml = null, $query, $dom, $xpath;
33

4-
public function __construct($param) {
5-
4+
/**
 * Create the scraper with an empty DOMDocument.
 *
 * No content is loaded here; the xpath property is only assigned by the
 * load methods, so one of them has to run before any query.
 */
public function __construct()
{
    $document = new DOMDocument();
    $this->dom = $document;
}
87

9-
if(filter_var($param, FILTER_VALIDATE_URL)){
10-
$this->dom->loadHTMLFile($param, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
11-
} else {
12-
$this->dom->loadXML($param);
13-
}
14-
8+
/**
 * Load an HTML document from a file path or URL and prepare it for querying.
 *
 * libxml parse errors are buffered while the document loads and then
 * discarded. The libxml error mode that was active before the call is
 * restored afterwards instead of being forced to "off".
 *
 * @param string $url Location handed to DOMDocument::loadHTMLFile().
 */
public function loadHTMLFile($url){
    // libxml_use_internal_errors() returns the previous setting; keep it so
    // a caller that already enabled internal errors is not silently reset.
    $previousMode = libxml_use_internal_errors(true);

    $this->dom->loadHTMLFile($url, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);

    // Drop the buffered parse errors so they do not pile up across loads.
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);

    $this->xpath = new DOMXPath($this->dom);
    $this->ishtml = true;
}
15+
16+
/**
 * Parse an XML string into the internal document and rebuild the XPath
 * helper for it.
 *
 * Marks the scraper as holding XML (ishtml = false).
 *
 * @param string $XML XML source handed to DOMDocument::loadXML().
 */
public function loadXML($XML)
{
    $document = $this->dom;
    $document->loadXML($XML);

    $this->xpath  = new DOMXPath($document);
    $this->ishtml = false;
}
1821

22+
/**
 * Parse an HTML string into the internal document and rebuild the XPath
 * helper for it.
 *
 * As in loadHTMLFile(), libxml parse errors are buffered during the load
 * and discarded afterwards, and the previous libxml error mode is restored.
 * Marks the scraper as holding HTML (ishtml = true).
 *
 * @param string $HTML HTML source handed to DOMDocument::loadHTML().
 */
public function loadHTML($HTML){
    // Buffer parser complaints instead of emitting them as PHP warnings.
    $previousMode = libxml_use_internal_errors(true);

    $this->dom->loadHTML($HTML);

    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);

    $this->xpath = new DOMXPath($this->dom);
    $this->ishtml = true;
}
27+
1928
private function convert2XPath($query){
2029
$xpath = $query;
2130

@@ -94,7 +103,16 @@ private function convert2XPath($query){
94103
return $xpath;
95104
}
96105

97-
public function Q($query, $root = false){
106+
/**
 * Run a query against the whole document and keep the result as the
 * current selection.
 *
 * The raw query is stored in the query property, translated by
 * convert2XPath(), prefixed with "//" and evaluated; the result is stored
 * in obj.
 *
 * @param string $query Selector in the syntax accepted by convert2XPath().
 * @return $this For method chaining.
 */
public function Q($query)
{
    $this->query = $query;

    $expression = '//' . $this->convert2XPath($query);
    $this->obj  = $this->xpath->query($expression);

    return $this;
}
114+
115+
private function _($query){
98116
$query = $this->convert2XPath($query);
99117

100118
$this->obj = $this->xpath->query("//$query");
@@ -103,6 +121,7 @@ public function Q($query, $root = false){
103121
}
104122

105123
public function query($query){
124+
$this->query = $query;
106125
$query = $this->convert2XPath($query);
107126

108127
$this->obj = $this->xpath->query("//$query");
@@ -117,6 +136,13 @@ public function setAttribute($attr, $value){
117136
$this->obj = null;
118137
}
119138

139+
/**
 * Remove an attribute from every node in the current selection.
 *
 * The selection (obj) is cleared afterwards.
 *
 * @param string $attr Name of the attribute to remove.
 */
public function removeAttribute($attr)
{
    $name = (string) $attr;

    foreach ($this->obj as $node) {
        $node->removeAttribute($name);
    }

    $this->obj = null;
}
145+
120146
public function addClass($class){
121147
foreach ($this->obj as $item){
122148
$otherClasses = $item->getAttribute("class");
@@ -182,20 +208,32 @@ public function appendHtml($html){
182208
$this->obj = null;
183209
}
184210

211+
/**
 * Insert a markup fragment as the first child of every node in the current
 * selection.
 *
 * The fragment is parsed with DOMDocument::loadXML(), so it must be
 * well-formed XML with a single root element. Only that root is imported
 * (deep), which brings its descendants along exactly once. Importing every
 * element matched by "//*" would insert each nested element a second time
 * as a separate copy.
 *
 * The selection (obj) is cleared afterwards.
 *
 * @param string $html Well-formed markup fragment with one root element.
 */
public function prependHtml($html){

    $fragment = new DOMDocument();
    $fragment->loadXML($html);

    // documentElement is null when the fragment could not be parsed.
    $root = $fragment->documentElement;

    if ($root !== null) {
        foreach ($this->obj as $item) {
            // Import per target: a node can only live in one place.
            $copy = $this->dom->importNode($root, true);
            $item->insertBefore($copy, $item->firstChild);
        }
    }

    $this->obj = null;
}
226+
185227
public function delete($keepinner = false){
186228

187229
foreach($this->obj as $item){
188-
if (!$item->parentNode->hasAttribute($item->nodeName)){
189-
if (!$keepinner){
190-
$item->parentNode->removeChild($item);
191-
} else {
192-
while ($item->firstChild instanceof DOMNode) {
193-
$item->parentNode->insertBefore($item->firstChild, $item);
194-
}
195-
$item->parentNode->removeChild($item);
196-
}
230+
if (!$keepinner){
231+
$item->parentNode->removeChild($item);
197232
} else {
198-
$item->parentNode->removeAttribute($item->nodeName);
233+
while ($item->firstChild instanceof DOMNode) {
234+
$item->parentNode->insertBefore($item->firstChild, $item);
235+
}
236+
$item->parentNode->removeChild($item);
199237
}
200238
}
201239

@@ -212,9 +250,8 @@ public function unwrap(){
212250
$this->obj = null;
213251
}
214252

215-
public function wrap($html){
216-
$tag = preg_replace("/<([\w\d]+).*><\/\\1>/", "$1", $html);
217-
253+
private function breakUp($tag, &$html, &$keys, &$vals, &$attrs){
254+
218255
$html = preg_replace_callback(
219256
'/([^=<>\s]*)=[\'|"]([^=]*)[\'|"]/',
220257
function($m){
@@ -228,8 +265,6 @@ function($m){
228265
$html
229266
);
230267

231-
$keys = array();
232-
$vals = array();
233268
$lines = explode("]", $html);
234269

235270
foreach($lines as $index => $value){
@@ -241,6 +276,17 @@ function($m){
241276
array_push($vals, $arr[1]);
242277
}
243278
$attrs = array_combine($keys, $vals);
279+
}
280+
281+
public function wrap($html){
282+
$attrs = array();
283+
$keys = array();
284+
$vals = array();
285+
286+
if (preg_match("/<([\w\d]+).*>[^<>]*<\/\\1>/", $html)){
287+
$tag = preg_replace("/<([\w\d]+).*>[^<>]*<\/\\1>/", "$1", $html);
288+
$this->breakUp($tag, $html, $keys, $vals, $attrs);
289+
}
244290

245291
foreach($this->obj as $item){
246292
$wrapper = $this->dom->createElement("$tag");
@@ -255,7 +301,7 @@ function($m){
255301
$this->obj = null;
256302
}
257303

258-
public function delEmptyTags(){
304+
public function removeEmptyTags(){
259305
$query = '//*[not(*) and not(@*) and not(text()[normalize-space()])]';
260306
foreach($this->xpath->query("$query") as $tag){
261307
$tag->parentNode->removeChild($tag);
@@ -315,33 +361,12 @@ public function count(){
315361
}
316362

317363
/**
 * Print the whole document.
 *
 * The document is serialized with saveHTML() when it was loaded as HTML
 * (ishtml is true) and with saveXML() otherwise.
 *
 * @param bool $format Value assigned to DOMDocument::$formatOutput before
 *                     serializing.
 */
public function echo($format = true){
    $this->dom->formatOutput = $format;

    // Plain echo, not printf(): the serialized document is data, not a
    // format string, and any "%" in it must be printed literally.
    echo $this->ishtml
        ? $this->dom->saveHTML()
        : $this->dom->saveXML();
}
346371

347372
public function replaceText($pattern, $replace, $html = true){
@@ -354,8 +379,17 @@ public function replaceText($pattern, $replace, $html = true){
354379
$item->textContent = $newtext;
355380
}
356381

357-
if($html){
358-
$this->dom->loadHTML(html_entity_decode($this->dom->saveXML()));
382+
if ($html) {
383+
if ($this->ishtml) {
384+
$this->dom->loadHTML(
385+
html_entity_decode($this->dom->saveHTML())
386+
);
387+
} else {
388+
$this->dom->loadXML(
389+
html_entity_decode($this->dom->saveXML())
390+
);
391+
}
392+
$this->xpath = new DOMXPath($this->dom);
359393
}
360394
}
361395

@@ -371,11 +405,20 @@ function($m) use ($func){
371405
$item->textContent = $newtext;
372406
}
373407

374-
if($html){
375-
$this->dom->loadHTML(html_entity_decode($this->dom->saveXML()));
408+
if ($html) {
409+
if ($this->ishtml) {
410+
$this->dom->loadHTML(
411+
html_entity_decode($this->dom->saveHTML())
412+
);
413+
} else {
414+
$this->dom->loadXML(
415+
html_entity_decode($this->dom->saveXML())
416+
);
417+
}
418+
$this->xpath = new DOMXPath($this->dom);
376419
}
377420
}
378-
421+
379422
public function hasClass($class){
380423
foreach($this->obj as $item){
381424
$classes = $item->getAttribute("class");
@@ -391,4 +434,12 @@ public function hasAttr($attr, $val){
391434
$bool = (preg_match("/".preg_quote($val)."/", $attrs)) ? true : false;
392435
return $bool;
393436
}
437+
438+
/**
 * Call a callback once per node in the current selection.
 *
 * For the n-th pass (1-based) the stored query is suffixed with "[n]" and
 * re-run through the private _() helper; whatever _() returns is handed to
 * the callback. The iterated nodes themselves are not passed on.
 *
 * NOTE(review): _() prefixes the expression with "//", and in XPath
 * "//x[n]" selects every x that is the n-th x child of its parent, not the
 * n-th match overall. Confirm this is the intended selection.
 *
 * @param callable $func Callback receiving the result of _() for each pass.
 */
public function iterate($func)
{
    $position = 1;

    foreach ($this->obj as $unused) {
        $selector = $this->query . "[$position]";
        $func($this->_($selector));
        ++$position;
    }
}
394445
}

0 commit comments

Comments
 (0)