forked from subins2000/phpwebcrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.php
48 lines (48 loc) · 1.45 KB
/
crawler.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
<?php
// Single-page link crawler: fetches one page with simple_html_dom and prints
// every distinct link found on it as an HTML list item.
//
// The full "<?php" open tag is used on purpose: the short "<?" form is only
// recognised when the short_open_tag ini setting is on, and otherwise the
// whole script is sent to the client as plain text.
include("simple_html_dom.php");

// Crawl log: urlencode()d page URL => date("YmdHis") stamp written by
// crawl_site() when it fetches that page.
$crawled_urls=array();

// Intended set of links already reported: crawl_site() stores
// urlencode()d link => 1 under this name.
$found_urls=array();
/**
 * Resolve a (possibly relative) URL reference against a base URL.
 *
 * Handled forms:
 *  - empty reference        -> the base itself
 *  - reference with scheme  -> returned unchanged
 *  - "#frag" / "?query"     -> appended to the base verbatim
 *  - "//host/path"          -> given the base's scheme
 *  - "/path"                -> resolved against the base host
 *  - "path", "./p", "../p"  -> resolved against the base directory
 *
 * The result is built from the base scheme, host and (if present) port only;
 * user-info, query and fragment of the base are not carried over.
 *
 * @param string $rel  URL reference as found in a document.
 * @param string $base Absolute URL the reference is relative to.
 * @return string Absolute URL.
 */
function rel2abs($rel, $base){
    $rel = (string) $rel;

    // An empty reference designates the base document; returning early also
    // avoids reading $rel[0] on an empty string below.
    if ($rel === '') return $base;

    // Already absolute (carries its own scheme): nothing to resolve.
    if (parse_url($rel, PHP_URL_SCHEME) != '') return $rel;

    // Fragment-only or query-only references are appended to the base as-is.
    if ($rel[0] == '#' || $rel[0] == '?') return $base.$rel;

    // Read the base components explicitly instead of extract()ing them, so a
    // base without a path/scheme/port does not leave variables undefined.
    $parts = parse_url($base);
    if (!is_array($parts)) $parts = array();
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
    $host   = isset($parts['host']) ? $parts['host'] : '';
    if (isset($parts['port'])) $host .= ':'.$parts['port'];
    $path   = isset($parts['path']) ? $parts['path'] : '';

    // Protocol-relative reference: only the scheme comes from the base.
    if (substr($rel, 0, 2) == '//') return $scheme.':'.$rel;

    // Drop the last path segment (the "file" part) to get the base directory.
    $path = preg_replace('#/[^/]*$#', '', $path);

    // A root-relative reference ignores the base directory entirely.
    if ($rel[0] == '/') $path = '';

    $abs = "$host$path/$rel";

    // Repeatedly collapse "//", "/./" and "/segment/../" until nothing changes.
    $re = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
    do {
        $abs = preg_replace($re, '/', $abs, -1, $n);
    } while ($n > 0);

    // Any "../" left over would climb above the root; discard it.
    $abs = str_replace("../", "", $abs);

    return $scheme.'://'.$abs;
}
/**
 * Turn a link taken from a page into an absolute URL.
 *
 * - Links starting with "http" are returned unchanged.
 * - Protocol-relative links ("//host/path") get the scheme of the base URL
 *   ("http" when the base has none).
 * - Everything else is handed to rel2abs().
 *
 * NOTE: unless the base path is exactly "/", the base is first reduced to the
 * site root (scheme://host[:port]/), so relative links are resolved against
 * the root rather than against the directory of the page they were found on.
 * That behaviour is kept as-is.
 *
 * @param string $u Link as found in the document (e.g. an href value).
 * @param string $b URL of the page the link was found on.
 * @return string Absolute URL; links that already carry a non-http scheme
 *                (mailto:, javascript:, ...) come back unchanged via rel2abs().
 */
function perfect_url($u,$b){
    $u = (string) $u;

    // Read the base components defensively: parse_url() omits keys for
    // components that are absent (e.g. no 'path' for "http://host").
    $bp = parse_url($b);
    if (!is_array($bp)) $bp = array();
    $scheme = (isset($bp['scheme']) && $bp['scheme'] != '') ? $bp['scheme'] : 'http';
    $host   = isset($bp['host']) ? $bp['host'] : '';
    if (isset($bp['port'])) $host .= ':'.$bp['port'];
    $path   = isset($bp['path']) ? $bp['path'] : '';

    // Any base that is not already the bare root is replaced by the site root.
    if ($path != '/') {
        $b = $scheme.'://'.$host.'/';
    }

    // Protocol-relative link: inherit the base scheme instead of forcing http,
    // so links found on an https page stay https.
    if (substr($u, 0, 2) == '//') {
        $u = $scheme.':'.$u;
    }

    if (substr($u, 0, 4) != 'http') {
        $u = rel2abs($u, $b);
    }
    return $u;
}
/**
 * Fetch one page and print every not-yet-seen link on it as an HTML <li>.
 *
 * A page is skipped when it was already fetched within the last 25 seconds
 * (per the global $crawled_urls log). Links are made absolute with
 * perfect_url(); "mail..." and "java..." URLs (mailto:, javascript:) are
 * ignored, and each distinct link is printed once (tracked in the global
 * $found_urls).
 *
 * @param string $u Absolute URL of the page to crawl.
 * @return void Output is echoed directly.
 */
function crawl_site($u){
    // Both registries live at file scope. $found_urls must be imported too,
    // otherwise it is an undefined local and array_key_exists() below is
    // called on a non-array.
    global $crawled_urls, $found_urls;
    if (!is_array($crawled_urls)) $crawled_urls = array();
    if (!is_array($found_urls)) $found_urls = array();

    $uen = urlencode($u);

    // Stamps are "YmdHis" strings, so they order chronologically.
    $cutoff = date("YmdHis", strtotime('-25 seconds', time()));
    if (array_key_exists($uen, $crawled_urls) && $crawled_urls[$uen] >= $cutoff) {
        return; // fetched recently enough
    }

    $html = file_get_html($u);
    // Record the attempt even when the fetch failed, so a broken page is not
    // re-requested on every call.
    $crawled_urls[$uen] = date("YmdHis");

    // Nothing to scan if the page could not be loaded/parsed.
    if (!$html) {
        return;
    }

    foreach ($html->find("a") as $li) {
        // Anchors without a usable href carry no link to report.
        $href = $li->href;
        if (!is_string($href) || $href === '') {
            continue;
        }

        $url = perfect_url($href, $u);
        $enurl = urlencode($url);
        if ($url != '' && substr($url, 0, 4) != "mail" && substr($url, 0, 4) != "java" && !array_key_exists($enurl, $found_urls)) {
            $found_urls[$enurl] = 1;
            // The URL comes from a remote page: escape it before putting it
            // into markup. double_encode=false keeps entities that are already
            // present in the scraped attribute (e.g. "&amp;") intact.
            $safe = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
            echo "<li><a target='_blank' href='".$safe."'>".$safe."</a></li>";
        }
    }
}
// Entry point: crawl the start page and print the links found on it.
crawl_site('http://www.subinsb.com');
?>